feat: Content-Based Purpose Inference

Added content_infer.py, which analyzes existing content to infer purpose:
- ContentPurposeInferer class
- Analyzes blog posts, articles, text
- Detects signals: tutorials, explainers, transformation stories, etc.
- Returns purpose, confidence, reasoning

Updated intake.py to weight all signals:
1. Explicit flags (weight: 1.0)
2. Content inference (weight: 0.4) - NEW
3. Keyword classification (weight: 0.3)
4. Conversational (weight: 0.5)

Now if you point at a blog:
- Tutorial posts → LEARN_HANDS_ON
- Explainers → UNDERSTAND
- Transformation stories → TRANSFORM
- Reviews/Comparisons → DECIDE
- Reference docs → REFERENCE
- Journey/Biography → BE_INSPIRED
2026-03-13 20:50:36 +00:00
parent b46e87ff76
commit 8cf833c729
3 changed files with 320 additions and 12 deletions
@@ -3,6 +3,7 @@
 Key components:
 - classifier: Classifies user input into ReaderPurpose
 - intake: Conversational intake agent for high-fidelity intent
 - content_infer: Infers purpose from existing blog/content
 """
 from opus_orchestrator.nonfiction.classifier import (
@@ -18,6 +19,11 @@ from opus_orchestrator.nonfiction.intake import (
    IntakeMode,
    determine_intake,
 )
 from opus_orchestrator.nonfiction.content_infer import (
    ContentPurposeInferer,
    ContentAnalysis,
    infer_purpose_from_content,
 )
 __all__ = [
    # Classifier
@@ -31,4 +37,8 @@ __all__ = [
    "IntakeResult",
    "IntakeMode",
    "determine_intake",
    # Content Inference
    "ContentPurposeInferer",
    "ContentAnalysis",
    "infer_purpose_from_content",
 ]
@@ -0,0 +1,208 @@
 """Content-Based Purpose Inference.
 Analyzes existing content to infer the reader purpose.
 This allows the system to determine purpose from blog posts, articles, etc.
 """
 from dataclasses import dataclass
 from typing import Optional
 from opus_orchestrator.nonfiction.classifier import ReaderPurpose
@dataclass
 class ContentAnalysis:
    """Result of analyzing content for purpose.

    Produced by ContentPurposeInferer.analyze and
    ContentPurposeInferer.infer_from_blog.
    """
    # The highest-scoring ReaderPurpose for the analyzed text.
    purpose: ReaderPurpose
    # Heuristic confidence; the inferer produces values from 0.3 to 0.95.
    confidence: float
    # Human-readable summary: the matched indicators for `purpose`, or a
    # "No strong signals" message when none matched.
    reasoning: str
    # Maps each purpose's `.value` to the list of indicator phrases that
    # matched; purposes with no matched indicators are omitted.
    signals: dict
 class ContentPurposeInferer:
    """Infers reader purpose from existing content.

    Analyzes blog posts, articles, or other content to determine
    what kind of book this content would become.

    Scoring is a keyword heuristic: every indicator phrase found in the text
    adds 1.0 to its purpose, every structure word adds 0.5, and every negation
    word subtracts 0.5 (never below zero). The purpose holding the largest
    share of the total score wins.
    """

    # Content patterns that indicate purpose
    # NOTE(review): very common words such as "from", "to", "why", "best" and
    # "top" will match almost any sizeable English text even with whole-word
    # matching - consider pruning them from the table.
    CONTENT_SIGNALS = {
        ReaderPurpose.LEARN_HANDS_ON: {
            "indicators": [
                "step by step", "how to", "tutorial", "guide to",
                "instructions", "learn to", "course", "workshop",
                "example code", "exercise", "practice", "build a",
                "create a", "implement", "getting started",
            ],
            "structure": ["step", "chapter", "lesson", "module", "exercise"],
        },
        ReaderPurpose.UNDERSTAND: {
            "indicators": [
                "why", "explains", "understand", "concept of",
                "the nature of", "how it works", "mechanism",
                "deep dive", "analysis", "framework", "principles",
                "mental model", "theory", "psychology", "science",
            ],
            "structure": ["overview", "background", "core concepts", "implications"],
        },
        ReaderPurpose.TRANSFORM: {
            "indicators": [
                "i was", "i became", "my journey", "transformation",
                "overcoming", "struggle", "breakthrough", "changed my life",
                "how i", "from", "to", "becoming", "awakening",
                "healing", "recovery", "manifest", "empower",
            ],
            "structure": ["before", "after", "journey", "struggle", "triumph"],
        },
        ReaderPurpose.DECIDE: {
            "indicators": [
                "compared to", "versus", "pros and cons", "should you",
                "which is better", "is it worth", "decision", "choose",
                "analysis", "recommendation", "best", "top", "ranking",
                "tradeoff", "evaluation", "case study",
            ],
            "structure": ["comparison", "versus", "pros", "cons", "verdict"],
        },
        ReaderPurpose.REFERENCE: {
            "indicators": [
                "reference", "documentation", "api", "specification",
                "manual", "handbook", "comprehensive", "complete guide",
                "all about", "definitive", "index", "table of contents",
            ],
            "structure": ["reference", "api", "syntax", "parameters", "examples"],
        },
        ReaderPurpose.BE_INSPIRED: {
            "indicators": [
                "story", "journey", "triumph", "against all odds",
                "inspiration", "motivation", "life lesson", "wisdom",
                "legacy", "calling", "warrior", "hero", "unstoppable",
            ],
            "structure": ["chapter one", "the beginning", "the end", "epilogue"],
        },
    }

    # Negative signals (reduce confidence)
    NEGATION_PATTERNS = {
        ReaderPurpose.LEARN_HANDS_ON: ["theory", "why", "explain", "concept"],
        ReaderPurpose.TRANSFORM: ["reference", "documentation", "api"],
    }

    @staticmethod
    def _has_term(term: str, text: str) -> bool:
        """Return True if *term* occurs in *text* as a whole word or phrase.

        Plain substring tests produce false positives ("api" inside "rapid",
        "cons" inside "consider"), so the term must sit on word boundaries.
        A simple inflection suffix (s/es/ed/ing) is tolerated so stems such as
        "explain" still match "explains", and the words of a multi-word phrase
        may be separated by any whitespace (for example a line break).

        Args:
            term: Indicator word or phrase (any case).
            text: Text to search; expected to be lower-cased already.
        """
        # Function-scope import: keeps this class self-contained without
        # touching the module's import block.
        import re

        words = term.lower().split()
        if not words:
            return False
        phrase = r"\s+".join(re.escape(word) for word in words)
        pattern = rf"(?<!\w){phrase}(?:e?s|ed|ing)?(?!\w)"
        return re.search(pattern, text) is not None

    def analyze(
        self,
        content: str,
        title: str = "",
        meta_description: str = "",
    ) -> ContentAnalysis:
        """Analyze content to infer purpose.

        Args:
            content: The text content to analyze
            title: Title of the content
            meta_description: Meta description if available

        Returns:
            ContentAnalysis with inferred purpose. When nothing matches, the
            first ReaderPurpose member is returned with confidence 0.3 and a
            "No strong signals" reasoning; callers should treat that purpose
            as a placeholder rather than a finding.
        """
        # Combine all text
        full_text = f"{title} {meta_description} {content}".lower()

        # Score each purpose
        scores: dict[ReaderPurpose, float] = {p: 0.0 for p in ReaderPurpose}
        signal_counts: dict[ReaderPurpose, list[str]] = {p: [] for p in ReaderPurpose}

        for purpose, patterns in self.CONTENT_SIGNALS.items():
            # Indicator phrases carry full weight and are reported back.
            for indicator in patterns["indicators"]:
                if self._has_term(indicator, full_text):
                    scores[purpose] += 1.0
                    signal_counts[purpose].append(indicator)
            # Structure words are weaker evidence: half weight, not reported.
            for structure in patterns.get("structure", []):
                if self._has_term(structure, full_text):
                    scores[purpose] += 0.5

        # Apply negations (reduce scores), keeping scores non-negative floats.
        for purpose, negations in self.NEGATION_PATTERNS.items():
            for negation in negations:
                if self._has_term(negation, full_text):
                    scores[purpose] = max(0.0, scores[purpose] - 0.5)

        # Normalize scores to each purpose's share of the total.
        total_score = sum(scores.values())
        if total_score > 0:
            normalized = {p: s / total_score for p, s in scores.items()}
        else:
            normalized = {p: 0.1 for p in ReaderPurpose}  # Uniform if no matches

        # Find best match (ties resolve to the earliest ReaderPurpose member).
        best_purpose = max(normalized, key=normalized.get)
        best_score = normalized[best_purpose]

        # Calculate confidence from the winner's share of the total score.
        # NOTE(review): this is a relative measure - a single matched word
        # yields a share of 1.0 and therefore 0.95 confidence.
        if best_score > 0.5:
            confidence = min(0.95, 0.5 + best_score * 0.5)
        elif best_score > 0.2:
            confidence = min(0.7, 0.3 + best_score * 0.4)
        else:
            confidence = 0.3

        # Build reasoning from (at most) the first five matched indicators.
        signals = signal_counts[best_purpose]
        if signals:
            reasoning = f"Content signals: {', '.join(signals[:5])}"
        else:
            reasoning = "No strong signals - purpose unclear"

        return ContentAnalysis(
            purpose=best_purpose,
            confidence=confidence,
            reasoning=reasoning,
            signals={p.value: c for p, c in signal_counts.items() if c},
        )

    def infer_from_blog(self, blog_posts: list[dict]) -> ContentAnalysis:
        """Infer purpose from multiple blog posts.

        The posts' content and excerpts are concatenated and analyzed as one
        document, with the post titles joined into a single title.

        Args:
            blog_posts: List of dicts with 'title', 'content', 'excerpt'.
                Missing keys and None values are treated as empty text.

        Returns:
            Aggregated ContentAnalysis. With more than three posts the
            confidence is raised by 0.1 (capped at 0.95), but only when at
            least one indicator actually matched.
        """
        body_parts: list[str] = []
        titles: list[str] = []
        for post in blog_posts:
            # `or ""` guards against explicit None values, which would
            # otherwise raise TypeError during concatenation.
            body_parts.append(str(post.get("content") or ""))
            body_parts.append(str(post.get("excerpt") or ""))
            title = str(post.get("title") or "")
            if title:
                titles.append(title)

        result = self.analyze(" ".join(body_parts), title="; ".join(titles))

        # A larger corpus makes a real finding more trustworthy, but must not
        # promote the no-evidence fallback (confidence 0.3) over downstream
        # confidence thresholds.
        if len(blog_posts) > 3 and result.signals:
            result.confidence = min(0.95, result.confidence + 0.1)

        return result
 # Convenience function
 def infer_purpose_from_content(
    content: str,
    title: str = "",
    meta_description: str = "",
 ) -> ContentAnalysis:
    """Run a one-off purpose analysis without managing an inferer yourself.

    Builds a fresh ContentPurposeInferer and delegates to its ``analyze``
    method with the given arguments.

    Args:
        content: The text content
        title: Title of the content
        meta_description: Optional meta description

    Returns:
        ContentAnalysis with inferred purpose
    """
    return ContentPurposeInferer().analyze(
        content,
        title=title,
        meta_description=meta_description,
    )
@@ -6,7 +6,8 @@ by asking clarifying questions or using available signals.
 This agent intelligently combines:
 1. Explicit user flags (--purpose learn)
 2. Keyword classification from concept
-3. Conversational intake (asking questions)
+3. Content inference from existing blog/posts
 4. Conversational intake (asking questions)
 The agent weights all inputs to make the best decision.
 """
@@ -16,6 +17,7 @@ from enum import Enum
 from typing import Optional
 from opus_orchestrator.nonfiction.classifier import PurposeClassifier, ReaderPurpose
 from opus_orchestrator.nonfiction.content_infer import ContentPurposeInferer, ContentAnalysis
 from opus_orchestrator.nonfiction_taxonomy import (
    select_framework,
    get_frameworks_for_purpose,
@@ -43,7 +45,12 @@ class IntakeInput:
    target_audience: str = ""
    intended_outcome: str = ""
-    # Option 3: Previous Q&A (if conversational)
+    # Option 3: Existing content (for inference)
    content: str = ""
    content_title: str = ""
    blog_posts: list = field(default_factory=list)
    # Option 4: Previous Q&A (if conversational)
    answers: dict[str, str] = field(default_factory=dict)
@@ -55,7 +62,9 @@ class IntakeResult:
    category: Optional[NonfictionCategory]
    framework: dict
    reasoning: str
-    source: str  # "explicit" | "classifier" | "intake" | "hybrid"
+    source: str  # "explicit" | "classifier" | "content" | "hybrid"
    content_analysis: Optional[ContentAnalysis] = None
    all_signals: dict = field(default_factory=dict)
 class IntakeAgent:
@@ -102,13 +111,23 @@ class IntakeAgent:
        ],
    }
    # Content inference
    CONTENT_INFERENCE_WEIGHT = 0.4  # Weight for content-based inference
    def __init__(self, llm_client=None):
        """Create the agent and its signal sources.

        Args:
            llm_client: Optional client passed to PurposeClassifier and also
                stored on the agent as ``llm_client``; defaults to None.
        """
        # Keyword/LLM classifier for the concept text.
        self.classifier = PurposeClassifier(llm_client)
        # Inferer used when existing content or blog posts are supplied.
        self.content_inferer = ContentPurposeInferer()
        self.llm_client = llm_client
    async def process(self, intake: IntakeInput, mode: IntakeMode = IntakeMode.AUTO) -> IntakeResult:
        """Process intake and determine purpose and framework.
        All signals are weighted:
        1. Explicit flags (weight: 1.0) - highest priority
        2. Content inference (weight: 0.4) - from existing blog/posts
        3. Keyword classification (weight: 0.3) - from concept
        4. Conversational (weight: 0.5) - from Q&A
        Args:
            intake: All available input signals
            mode: How to resolve (conversational, auto, explicit)
@@ -116,31 +135,98 @@ class IntakeAgent:
        Returns:
            IntakeResult with purpose, framework, and reasoning
        """
        signals = {}  # Track all signals for reasoning
        # Step 1: Check explicit flags (highest priority)
        if intake.explicit_purpose:
            return self._process_explicit(intake)
        # Step 2: Use classifier for clear cases
        if mode == IntakeMode.EXPLICIT:
            return self._need_more_info(intake)
-        # Step 3: Auto-classify from concept
+        # Step 2: Content inference (if content provided)
        content_result = None
        if intake.content or intake.blog_posts:
            if intake.blog_posts:
                content_result = self.content_inferer.infer_from_blog(intake.blog_posts)
            elif intake.content:
                content_result = self.content_inferer.analyze(
                    intake.content, 
                    title=intake.content_title
                )
            signals["content"] = content_result
        # Step 3: Keyword classification from concept
        classifier_result = self.classifier._keyword_classify(
            concept=intake.concept,
            target_audience=intake.target_audience,
            intended_outcome=intake.intended_outcome,
        )
        signals["concept"] = classifier_result
-        # If high confidence, use it
+        # Step 4: WEIGHTED DECISION - combine signals
-        if classifier_result.confidence >= 0.7:
+        purpose_scores: dict[ReaderPurpose, float] = {p: 0.0 for p in ReaderPurpose}
            return self._build_result_from_classification(intake, classifier_result, "classifier")
-        # Step 4: If conversational and low confidence, ask questions
+        # Add content inference (if available)
-        if mode == IntakeMode.CONVERSATIONAL and classifier_result.confidence < 0.5:
+        if content_result and content_result.confidence > 0.3:
            purpose_scores[content_result.purpose] += (
                content_result.confidence * self.CONTENT_INFERENCE_WEIGHT
            )
        # Add classifier result
        purpose_scores[classifier_result.purpose] += (
            classifier_result.confidence * 0.3
        )
        # Find winning purpose
        best_purpose = max(purpose_scores, key=purpose_scores.get)
        best_score = purpose_scores[best_purpose]
        # Calculate final confidence
        confidence = min(0.95, best_score)
        # If confidence is low and in conversational mode, ask questions
        if confidence < 0.4 and mode == IntakeMode.CONVERSATIONAL:
            return self._need_more_info(intake)
-        # Step 5: Fall back to classification even with medium confidence
+        # Determine source
-        return self._build_result_from_classification(intake, classifier_result, "classifier")
+        if content_result and content_result.confidence > 0.5:
            source = "content"
        elif content_result and classifier_result.confidence > 0.3:
            source = "hybrid"
        else:
            source = "classifier"
        # Get category from input
        category = None
        if intake.explicit_category:
            try:
                category = NonfictionCategory(intake.explicit_category.lower())
            except ValueError:
                pass
        # Select framework
        framework = select_framework(
            purpose=best_purpose,
            category=category,
        )
        # Build reasoning
        reasons = []
        if content_result:
            reasons.append(f"content: {content_result.reasoning}")
        reasons.append(f"concept: {classifier_result.reasoning}")
        return IntakeResult(
            purpose=best_purpose,
            confidence=confidence,
            category=category,
            framework=framework,
            reasoning=" | ".join(reasons),
            source=source,
            content_analysis=content_result,
            all_signals=signals,
        )
    def _process_explicit(self, intake: IntakeInput) -> IntakeResult:
        """Process when user provided explicit purpose."""
@@ -171,6 +257,8 @@ class IntakeAgent:
            framework=framework,
            reasoning=f"Explicit user selection: {intake.explicit_purpose}",
            source="explicit",
            content_analysis=None,
            all_signals={"explicit": intake.explicit_purpose},
        )
    def _process_auto(self, intake: IntakeInput) -> IntakeResult:
@@ -224,6 +312,8 @@ class IntakeAgent:
            framework=select_framework(purpose=ReaderPurpose.UNDERSTAND),
            reasoning="Input ambiguous - defaulted to UNDERSTAND. Use --purpose flag for explicit selection.",
            source="intake",
            content_analysis=None,
            all_signals={},
        )
    def get_questions(self, purpose: Optional[ReaderPurpose] = None) -> list[str]: