From 8cf833c7290ad1a6179650bf11700d1d71616f62 Mon Sep 17 00:00:00 2001
From: Mark Randall Havens <mark@thefoldwithin.earth>
Date: Fri, 13 Mar 2026 20:50:36 +0000
Subject: [PATCH] feat: Content-Based Purpose Inference
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Added content_infer.py - analyzes existing content to infer purpose:

- ContentPurposeInferer class
- Analyzes blog posts, articles, text
- Detects signals: tutorials, explainers, transformation stories, etc.
- Returns purpose, confidence, reasoning

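Updated intake.py to combine the available signals:
1. Explicit flags (weight: 1.0) - short-circuit, returned before any weighting
2. Content inference (weight: 0.4) - NEW
3. Keyword classification (weight: 0.3)
4. Conversational (weight: 0.5) - not applied yet: process() does not weight Q&A answers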

Now if you point at a blog:
- Tutorial posts → LEARN_HANDS_ON
- Explainers → UNDERSTAND
- Transformation stories → TRANSFORM
- Reviews/Comparisons → DECIDE
- Reference docs → REFERENCE
- Journey/Biography → BE_INSPIRED
---
 opus_orchestrator/nonfiction/__init__.py      |  10 +
 opus_orchestrator/nonfiction/content_infer.py | 208 ++++++++++++++++++
 opus_orchestrator/nonfiction/intake.py        | 114 +++++++++-
 3 files changed, 320 insertions(+), 12 deletions(-)
 create mode 100644 opus_orchestrator/nonfiction/content_infer.py

diff --git a/opus_orchestrator/nonfiction/__init__.py b/opus_orchestrator/nonfiction/__init__.py
index 1330dc6..a64da98 100644
--- a/opus_orchestrator/nonfiction/__init__.py
+++ b/opus_orchestrator/nonfiction/__init__.py
@@ -3,6 +3,7 @@
 Key components:
 - classifier: Classifies user input into ReaderPurpose
 - intake: Conversational intake agent for high-fidelity intent
+- content_infer: Infers purpose from existing blog/content
 """
 
 from opus_orchestrator.nonfiction.classifier import (
@@ -18,6 +19,11 @@ from opus_orchestrator.nonfiction.intake import (
     IntakeMode,
     determine_intake,
 )
+from opus_orchestrator.nonfiction.content_infer import (
+    ContentPurposeInferer,
+    ContentAnalysis,
+    infer_purpose_from_content,
+)
 
 __all__ = [
     # Classifier
@@ -31,4 +37,8 @@ __all__ = [
     "IntakeResult",
     "IntakeMode",
     "determine_intake",
+    # Content Inference
+    "ContentPurposeInferer",
+    "ContentAnalysis",
+    "infer_purpose_from_content",
 ]
diff --git a/opus_orchestrator/nonfiction/content_infer.py b/opus_orchestrator/nonfiction/content_infer.py
new file mode 100644
index 0000000..94fb2b8
--- /dev/null
+++ b/opus_orchestrator/nonfiction/content_infer.py
@@ -0,0 +1,208 @@
+"""Content-Based Purpose Inference.
+
+Analyzes existing content to infer the reader purpose.
+This allows the system to determine purpose from blog posts, articles, etc.
+"""
+
+from dataclasses import dataclass
+from typing import Optional
+
+from opus_orchestrator.nonfiction.classifier import ReaderPurpose
+
+
+@dataclass
+class ContentAnalysis:
+    """Result of analyzing content for purpose."""
+    purpose: ReaderPurpose  # purpose with the highest normalized score
+    confidence: float  # heuristic; ContentPurposeInferer yields 0.3 to 0.95
+    reasoning: str  # short human-readable summary of the matched signals
+    signals: dict  # purpose value -> matched indicator phrases (non-empty only)
+
+
+class ContentPurposeInferer:
+    """Infers reader purpose from existing content.
+
+    Analyzes blog posts, articles, or other content to determine
+    what kind of book this content would become.
+    """
+
+    # Content patterns that indicate purpose
+    CONTENT_SIGNALS = {
+        ReaderPurpose.LEARN_HANDS_ON: {
+            "indicators": [
+                "step by step", "how to", "tutorial", "guide to",
+                "instructions", "learn to", "course", "workshop",
+                "example code", "exercise", "practice", "build a",
+                "create a", "implement", "getting started",
+            ],
+            "structure": ["step", "chapter", "lesson", "module", "exercise"],
+        },
+        ReaderPurpose.UNDERSTAND: {
+            "indicators": [
+                "why", "explains", "understand", "concept of",
+                "the nature of", "how it works", "mechanism",
+                "deep dive", "analysis", "framework", "principles",
+                "mental model", "theory", "psychology", "science",
+            ],
+            "structure": ["overview", "background", "core concepts", "implications"],
+        },
+        ReaderPurpose.TRANSFORM: {
+            "indicators": [
+                "i was", "i became", "my journey", "transformation",
+                "overcoming", "struggle", "breakthrough", "changed my life",
+                "how i", "becoming", "awakening", "healing", "recovery",
+                "manifest", "empower",  # bare "from"/"to" removed: in nearly all text
+            ],
+            "structure": ["before", "after", "journey", "struggle", "triumph"],
+        },
+        ReaderPurpose.DECIDE: {
+            "indicators": [
+                "compared to", "versus", "pros and cons", "should you",
+                "which is better", "is it worth", "decision", "choose",
+                "analysis", "recommendation", "best", "top", "ranking",
+                "tradeoff", "evaluation", "case study",
+            ],
+            "structure": ["comparison", "versus", "pros", "cons", "verdict"],
+        },
+        ReaderPurpose.REFERENCE: {
+            "indicators": [
+                "reference", "documentation", "api", "specification",
+                "manual", "handbook", "comprehensive", "complete guide",
+                "all about", "definitive", "index", "table of contents",
+            ],
+            "structure": ["reference", "api", "syntax", "parameters", "examples"],
+        },
+        ReaderPurpose.BE_INSPIRED: {
+            "indicators": [
+                "story", "journey", "triumph", "against all odds",
+                "inspiration", "motivation", "life lesson", "wisdom",
+                "legacy", "calling", "warrior", "hero", "unstoppable",
+            ],
+            "structure": ["chapter one", "the beginning", "the end", "epilogue"],
+        },
+    }
+
+    # Negative signals (reduce confidence)
+    NEGATION_PATTERNS = {
+        ReaderPurpose.LEARN_HANDS_ON: ["theory", "why", "explain", "concept"],
+        ReaderPurpose.TRANSFORM: ["reference", "documentation", "api"],
+    }
+
+    @staticmethod
+    def _mentions(text: str, phrase: str) -> bool:
+        """Return True if *phrase* starts at a word boundary somewhere in *text*.
+
+        Plain substring tests also fire mid-word ("api" in "rapid", "story" in
+        "history"); anchoring the start avoids that but keeps "tutorials".
+        """
+        import re  # local import: the module's import block stays untouched
+        return re.search(rf"\b{re.escape(phrase.lower())}", text) is not None
+
+    def analyze(
+        self,
+        content: str,
+        title: str = "",
+        meta_description: str = "",
+    ) -> ContentAnalysis:
+        """Analyze content to infer purpose.
+
+        Each indicator phrase found adds 1.0 to its purpose and each structure
+        word 0.5; each negation phrase found subtracts 0.5 (floored at 0).
+
+        Args:
+            content: The text content to analyze
+            title: Title of the content
+            meta_description: Meta description if available
+
+        Returns:
+            ContentAnalysis for the top-scoring purpose (confidence 0.3 to 0.95)
+        """
+        full_text = f"{title} {meta_description} {content}".lower()
+        found = self._mentions
+
+        scores: dict[ReaderPurpose, float] = {p: 0.0 for p in ReaderPurpose}
+        signal_counts: dict[ReaderPurpose, list[str]] = {p: [] for p in ReaderPurpose}
+
+        for purpose, patterns in self.CONTENT_SIGNALS.items():
+            hits = [i for i in patterns["indicators"] if found(full_text, i)]
+            signal_counts[purpose].extend(hits)
+            extras = sum(found(full_text, s) for s in patterns.get("structure", []))
+            scores[purpose] += len(hits) + 0.5 * extras
+
+        # Apply negations (reduce scores)
+        for purpose, negations in self.NEGATION_PATTERNS.items():
+            for negation in negations:
+                if found(full_text, negation):
+                    scores[purpose] = max(0.0, scores[purpose] - 0.5)
+
+        # Normalize scores
+        total_score = sum(scores.values())
+        if total_score > 0:
+            normalized = {p: s / total_score for p, s in scores.items()}
+        else:
+            normalized = {p: 0.1 for p in ReaderPurpose}  # Uniform if no matches
+
+        best_purpose = max(normalized, key=normalized.get)
+        best_score = normalized[best_purpose]
+
+        # Calculate confidence
+        if best_score > 0.5:
+            confidence = min(0.95, 0.5 + best_score * 0.5)
+        elif best_score > 0.2:
+            confidence = min(0.7, 0.3 + best_score * 0.4)
+        else:
+            confidence = 0.3
+
+        signals = signal_counts[best_purpose]
+        if signals:
+            reasoning = f"Content signals: {', '.join(signals[:5])}"
+        else:
+            reasoning = "No strong signals - purpose unclear"
+
+        return ContentAnalysis(
+            purpose=best_purpose,
+            confidence=confidence,
+            reasoning=reasoning,
+            signals={p.value: c for p, c in signal_counts.items() if c},
+        )
+
+    def infer_from_blog(self, blog_posts: list[dict]) -> ContentAnalysis:
+        """Infer purpose from multiple blog posts.
+
+        Args:
+            blog_posts: List of dicts with 'title', 'content', 'excerpt';
+                missing or None values are treated as empty text
+
+        Returns:
+            Aggregated ContentAnalysis
+        """
+        parts = [p.get(k) or "" for p in blog_posts for k in ("content", "excerpt")]
+        titles = [p.get("title") or "" for p in blog_posts]
+        result = self.analyze(" ".join(parts), title="; ".join(titles))
+
+        # Boost only real matches: with no signals the purpose is arbitrary, and
+        # lifting 0.3 to 0.4 would get it past callers that require > 0.3.
+        if len(blog_posts) > 3 and result.signals:
+            result.confidence = min(0.95, result.confidence + 0.1)
+
+        return result
+
+
+# Convenience function
+def infer_purpose_from_content(
+    content: str,
+    title: str = "",
+    meta_description: str = "",
+) -> ContentAnalysis:
+    """Infer the reader purpose of one piece of content in a single call.
+
+    Args:
+        content: Body text to analyze
+        title: Title of the content, if any
+        meta_description: Meta description, if any
+
+    Returns:
+        The ContentAnalysis from a fresh ContentPurposeInferer's analyze()
+    """
+    analysis = ContentPurposeInferer().analyze(content, title, meta_description)
+    return analysis
diff --git a/opus_orchestrator/nonfiction/intake.py b/opus_orchestrator/nonfiction/intake.py
index bcf276c..02ee609 100644
--- a/opus_orchestrator/nonfiction/intake.py
+++ b/opus_orchestrator/nonfiction/intake.py
@@ -6,7 +6,8 @@ by asking clarifying questions or using available signals.
 This agent intelligently combines:
 1. Explicit user flags (--purpose learn)
 2. Keyword classification from concept
-3. Conversational intake (asking questions)
+3. Content inference from existing blog/posts
+4. Conversational intake (asking questions)
 
 The agent weights all inputs to make the best decision.
 """
@@ -16,6 +17,7 @@ from enum import Enum
 from typing import Optional
 
 from opus_orchestrator.nonfiction.classifier import PurposeClassifier, ReaderPurpose
+from opus_orchestrator.nonfiction.content_infer import ContentPurposeInferer, ContentAnalysis
 from opus_orchestrator.nonfiction_taxonomy import (
     select_framework,
     get_frameworks_for_purpose,
@@ -43,7 +45,12 @@ class IntakeInput:
     target_audience: str = ""
     intended_outcome: str = ""
     
-    # Option 3: Previous Q&A (if conversational)
+    # Option 3: Existing content (for inference)
+    content: str = ""
+    content_title: str = ""
+    blog_posts: list = field(default_factory=list)
+    
+    # Option 4: Previous Q&A (if conversational)
     answers: dict[str, str] = field(default_factory=dict)
 
 
@@ -55,7 +62,9 @@ class IntakeResult:
     category: Optional[NonfictionCategory]
     framework: dict
     reasoning: str
-    source: str  # "explicit" | "classifier" | "intake" | "hybrid"
+    source: str  # "explicit" | "classifier" | "content" | "hybrid" | "intake"
+    content_analysis: Optional[ContentAnalysis] = None
+    all_signals: dict = field(default_factory=dict)
 
 
 class IntakeAgent:
@@ -102,13 +111,23 @@ class IntakeAgent:
         ],
     }
     
+    # Content inference
+    CONTENT_INFERENCE_WEIGHT = 0.4  # Weight for content-based inference
+    
     def __init__(self, llm_client=None):
         self.classifier = PurposeClassifier(llm_client)
+        self.content_inferer = ContentPurposeInferer()
         self.llm_client = llm_client
     
     async def process(self, intake: IntakeInput, mode: IntakeMode = IntakeMode.AUTO) -> IntakeResult:
         """Process intake and determine purpose and framework.
         
+        How the signals are used:
+        1. Explicit flags - highest priority; returned without any weighting
+        2. Content inference (weight: 0.4) - from existing blog/posts
+        3. Keyword classification (weight: 0.3) - from concept
+        4. Conversational Q&A answers - not weighted by this method
+        
         Args:
             intake: All available input signals
             mode: How to resolve (conversational, auto, explicit)
@@ -116,31 +135,98 @@ class IntakeAgent:
         Returns:
             IntakeResult with purpose, framework, and reasoning
         """
+        signals = {}  # Track all signals for reasoning
+        
         # Step 1: Check explicit flags (highest priority)
         if intake.explicit_purpose:
             return self._process_explicit(intake)
         
-        # Step 2: Use classifier for clear cases
         if mode == IntakeMode.EXPLICIT:
             return self._need_more_info(intake)
         
-        # Step 3: Auto-classify from concept
+        # Step 2: Content inference (if content provided)
+        content_result = None
+        if intake.content or intake.blog_posts:
+            if intake.blog_posts:
+                content_result = self.content_inferer.infer_from_blog(intake.blog_posts)
+            elif intake.content:
+                content_result = self.content_inferer.analyze(
+                    intake.content, 
+                    title=intake.content_title
+                )
+            signals["content"] = content_result
+        
+        # Step 3: Keyword classification from concept
         classifier_result = self.classifier._keyword_classify(
             concept=intake.concept,
             target_audience=intake.target_audience,
             intended_outcome=intake.intended_outcome,
         )
+        signals["concept"] = classifier_result
         
-        # If high confidence, use it
-        if classifier_result.confidence >= 0.7:
-            return self._build_result_from_classification(intake, classifier_result, "classifier")
+        # Step 4: WEIGHTED DECISION - combine signals
+        purpose_scores: dict[ReaderPurpose, float] = {p: 0.0 for p in ReaderPurpose}
         
-        # Step 4: If conversational and low confidence, ask questions
-        if mode == IntakeMode.CONVERSATIONAL and classifier_result.confidence < 0.5:
+        # Add content inference (if available), tracking the total weight in play
+        total_weight = keyword_weight = 0.3  # the classifier always contributes
+        if content_result and content_result.confidence > 0.3:
+            total_weight += self.CONTENT_INFERENCE_WEIGHT
+            purpose_scores[content_result.purpose] += (
+                content_result.confidence * self.CONTENT_INFERENCE_WEIGHT
+            )
+
+        # Add classifier result
+        purpose_scores[classifier_result.purpose] += (
+            classifier_result.confidence * keyword_weight
+        )
+
+        # Winner; weighted-mean confidence so a lone signal is not capped at its weight
+        best_purpose = max(purpose_scores, key=purpose_scores.get)
+        best_score = purpose_scores[best_purpose]
+        confidence = min(0.95, best_score / total_weight)
+        
+        # If confidence is low and in conversational mode, ask questions
+        if confidence < 0.4 and mode == IntakeMode.CONVERSATIONAL:
             return self._need_more_info(intake)
         
-        # Step 5: Fall back to classification even with medium confidence
-        return self._build_result_from_classification(intake, classifier_result, "classifier")
+        # Determine source
+        if content_result and content_result.confidence > 0.5:
+            source = "content"
+        elif content_result and classifier_result.confidence > 0.3:
+            source = "hybrid"
+        else:
+            source = "classifier"
+        
+        # Get category from input
+        category = None
+        if intake.explicit_category:
+            try:
+                category = NonfictionCategory(intake.explicit_category.lower())
+            except ValueError:
+                pass
+        
+        # Select framework
+        framework = select_framework(
+            purpose=best_purpose,
+            category=category,
+        )
+        
+        # Build reasoning
+        reasons = []
+        if content_result:
+            reasons.append(f"content: {content_result.reasoning}")
+        reasons.append(f"concept: {classifier_result.reasoning}")
+        
+        return IntakeResult(
+            purpose=best_purpose,
+            confidence=confidence,
+            category=category,
+            framework=framework,
+            reasoning=" | ".join(reasons),
+            source=source,
+            content_analysis=content_result,
+            all_signals=signals,
+        )
     
     def _process_explicit(self, intake: IntakeInput) -> IntakeResult:
         """Process when user provided explicit purpose."""
@@ -171,6 +257,8 @@ class IntakeAgent:
             framework=framework,
             reasoning=f"Explicit user selection: {intake.explicit_purpose}",
             source="explicit",
+            content_analysis=None,
+            all_signals={"explicit": intake.explicit_purpose},
         )
     
     def _process_auto(self, intake: IntakeInput) -> IntakeResult:
@@ -224,6 +312,8 @@ class IntakeAgent:
             framework=select_framework(purpose=ReaderPurpose.UNDERSTAND),
             reasoning="Input ambiguous - defaulted to UNDERSTAND. Use --purpose flag for explicit selection.",
             source="intake",
+            content_analysis=None,
+            all_signals={},
         )
     
     def get_questions(self, purpose: Optional[ReaderPurpose] = None) -> list[str]: