feat: Issue #18 - Purpose Classifier for Nonfiction

Created opus_orchestrator/nonfiction/classifier.py:
- PurposeClassifier class with keyword-based classification
- LLM-enhanced classification (optional)
- ReaderPurpose enum (6 purposes)
- ClassificationResult dataclass

Keyword classification covers:
- LEARN_HANDS_ON: how to, learn to, tutorial, skills, etc.
- UNDERSTAND: understand, why, concept, mental model, etc.
- TRANSFORM: change, become, improve, habits, etc.
- DECIDE: decide, choose, compare, vs, analysis
- REFERENCE: manual, handbook, comprehensive, API
- BE_INSPIRED: inspire, story, journey, biography

Tests pass for all 6 purposes with high confidence.

This is the foundation for the entire nonfiction pipeline (Issue #18).
2026-03-13 20:15:20 +00:00
parent d98ef622d9
commit 4b4addedf7
3 changed files with 552 additions and 0 deletions
@@ -0,0 +1,19 @@
+"""Nonfiction submodule for Opus Orchestrator.
+
+Key components:
+- classifier: Classifies user input into ReaderPurpose
+"""
+
+from opus_orchestrator.nonfiction.classifier import (
+    PurposeClassifier,
+    ClassificationResult,
+    classify_purpose,
+    ReaderPurpose,
+)
+
+# Names re-exported from opus_orchestrator.nonfiction.classifier; this list
+# is what `from opus_orchestrator.nonfiction import *` exposes.
+__all__ = [
+    "PurposeClassifier",
+    "ClassificationResult",
+    "classify_purpose",
+    "ReaderPurpose",
+]
@@ -0,0 +1,266 @@
+"""Purpose Classifier for Nonfiction Books.
+
+Classifies user input into ReaderPurpose - why the reader will be reading this book.
+This is the foundation for the entire nonfiction pipeline.
+
+Usage:
+    from opus_orchestrator.nonfiction.classifier import PurposeClassifier, ReaderPurpose
+    
+    classifier = PurposeClassifier()
+    result = await classifier.classify(
+        concept="Leadership for introverts",
+        target_audience="Introverted professionals who want to develop leadership skills",
+        intended_outcome="Learn to lead with quiet confidence"
+    )
+    
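+    # Illustrative output only: the actual values depend on which keywords
+    # match and on whether an LLM client was passed to PurposeClassifier.
+    print(result.purpose)       # e.g. ReaderPurpose.TRANSFORM
+    print(result.confidence)    # e.g. 0.87
+    print(result.reasoning)     # e.g. "Target audience wants 'develop' - indicates self-transformation"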
+"""
+
+import json
+import logging
+import re
+from dataclasses import dataclass
+from enum import Enum
+from typing import Optional
+
+
+class ReaderPurpose(str, Enum):
+    """Why is the reader reading this book?
+
+    Members are also plain strings (the class mixes in ``str``), so a member
+    compares equal to its lowercase value, e.g. ``"transform"``.
+    """
+    # Reader wants to DO something specific (how-to, tutorial, skills).
+    LEARN_HANDS_ON = "learn_hands_on"
+    # Reader wants to GRASP a concept deeply (why, theory, mental model).
+    UNDERSTAND = "understand"
+    # Reader wants to CHANGE themselves (habits, growth, mindset).
+    TRANSFORM = "transform"
+    # Reader wants to make an informed decision (compare, pros and cons).
+    DECIDE = "decide"
+    # Reader wants to LOOK UP information (manual, handbook, documentation).
+    REFERENCE = "reference"
+    # Reader wants to feel motivated (biography, memoir, story).
+    BE_INSPIRED = "be_inspired"
+
+
+@dataclass
+class ClassificationResult:
+    """Result of purpose classification."""
+    # The purpose the classifier settled on.
+    purpose: ReaderPurpose
+    # How sure the classifier is. The keyword path reports 0.3 when nothing
+    # matched and never reports more than 0.95.
+    confidence: float
+    # Human-readable explanation of the choice (matched keywords, the LLM's
+    # explanation, or a note that a default was used).
+    reasoning: str
+    # Optional list of runner-up purposes. NOTE(review): nothing visible in
+    # this module populates it, so it is None unless a caller sets it.
+    alternative_purposes: Optional[list] = None
+
+
+class PurposeClassifier:
+    """Classify a nonfiction book concept into a ReaderPurpose.
+
+    Classification runs in up to two stages:
+
+    1. A keyword pass scores every purpose by counting the phrases from
+       ``PURPOSE_KEYWORDS`` found in the input and subtracting one point for
+       every conflicting phrase listed in ``PURPOSE_NEGATIONS``.
+    2. When the keyword pass is not confident enough and an LLM client was
+       supplied, the LLM is asked as well; its answer is used only if it
+       reports a higher confidence than the keyword pass.
+
+    Without an LLM client the classifier is purely keyword-based.
+    """
+
+    # Keyword confidence at or above this value is returned without asking the LLM.
+    KEYWORD_CONFIDENCE_THRESHOLD = 0.8
+    # Keyword-derived confidence never exceeds this value.
+    MAX_KEYWORD_CONFIDENCE = 0.95
+    # Confidence reported when no keyword matched or the LLM reply was unusable.
+    FALLBACK_CONFIDENCE = 0.3
+    # Confidence assumed when the LLM reply has no usable "confidence" value.
+    DEFAULT_LLM_CONFIDENCE = 0.7
+    # Keywords this short must match as whole words ("api", "roi", "vs", "why").
+    WHOLE_WORD_MAX_LENGTH = 3
+
+    _logger = logging.getLogger(__name__)
+
+    PURPOSE_KEYWORDS = {
+        ReaderPurpose.LEARN_HANDS_ON: [
+            "how to", "how-to", "learn to", "master", "step by step",
+            "beginner's guide", "tutorial", "practical", "hands-on",
+            "skills", "do it yourself", "build", "create", "make",
+            "implement", "develop skills", "learn skills", "course",
+            "workshop", "training", "teach yourself", "guide to",
+            "becoming", "learn the basics", "fundamentals",
+        ],
+        ReaderPurpose.UNDERSTAND: [
+            "understand", "why", "how it works", "explain", "concept",
+            "mental model", "deep dive", "exploration", "the nature of",
+            "the truth about", "what is", "meaning", "philosophy",
+            "theory", "framework", "principles", "inside story",
+            "real story", "hidden", "secret", "science of",
+            "psychology of", "the way", "essence", "sapiens",
+        ],
+        ReaderPurpose.TRANSFORM: [
+            "transform", "change", "become", "develop", "improve",
+            "better", "overcome", "heal", "grow", "personal growth",
+            "self-improvement", "self help", "empower", "breakthrough",
+            "awakening", "journey", "awaken", "reinvent",
+            "reclaim", "freedom", "love yourself", "healing",
+            "recovery", "manifest", "attract", "abundance",
+            "habits", "routines", "mindset", "productivity",
+        ],
+        ReaderPurpose.DECIDE: [
+            "decide", "choose", "compare", "vs", "versus",
+            "which is better", "pros and cons", "trade-off", "decision",
+            "guide", "strategies", "strategy", "choosing", "selecting",
+            "investment", "where to put", "how to allocate", "prioritize",
+            "business case", "roi", "worth it", "should i", "analysis",
+        ],
+        ReaderPurpose.REFERENCE: [
+            "reference", "manual", "handbook", "dictionary", "encyclopedia",
+            "comprehensive", "complete guide", "all about", "definitive",
+            "bible", "catalog", "directory", "index", "lookup",
+            "specification", "documentation", "api", "technical",
+            "architecture", "system design", "best practices",
+        ],
+        ReaderPurpose.BE_INSPIRED: [
+            "inspire", "motivational", "biography", "memoir", "story",
+            "life", "journey", "triumph", "overcoming", "against all odds",
+            "unstoppable", "dream", "vision", "legacy", "purpose",
+            "calling", "warrior", "hero", "legend", "icon",
+        ],
+    }
+
+    # Phrases that argue AGAINST a purpose; each one found costs that purpose a point.
+    PURPOSE_NEGATIONS = {
+        ReaderPurpose.LEARN_HANDS_ON: ["understand", "explain", "why", "concept"],
+        ReaderPurpose.TRANSFORM: ["reference", "manual", "tutorial"],
+        ReaderPurpose.UNDERSTAND: ["how to", "step by step", "tutorial"],
+    }
+
+    # Shorthand labels an LLM may return, mapped to canonical ReaderPurpose values.
+    _LLM_PURPOSE_ALIASES = {"learn": "learn_hands_on"}
+
+    def __init__(self, llm_client=None):
+        """Create a classifier.
+
+        Args:
+            llm_client: Optional client used when the keyword pass is not
+                confident. It must provide an awaitable
+                ``complete_async(system_prompt=..., user_prompt=...,
+                temperature=..., max_tokens=...)`` whose result is the model's
+                text reply. ``None`` disables the LLM stage.
+        """
+        self.llm_client = llm_client
+
+    async def classify(
+        self,
+        concept: str,
+        target_audience: str = "",
+        intended_outcome: str = "",
+    ) -> ClassificationResult:
+        """Classify user input into ReaderPurpose.
+
+        Args:
+            concept: Book concept or working title.
+            target_audience: Optional description of the intended readers.
+            intended_outcome: Optional description of what readers should gain.
+
+        Returns:
+            The keyword result when it is confident enough, when no LLM client
+            is configured, or when the LLM call fails; otherwise whichever of
+            the keyword and LLM results reports the higher confidence (the
+            keyword result wins ties).
+        """
+        keyword_result = self._keyword_classify(concept, target_audience, intended_outcome)
+
+        if keyword_result.confidence >= self.KEYWORD_CONFIDENCE_THRESHOLD:
+            return keyword_result
+        if not self.llm_client:
+            return keyword_result
+
+        try:
+            llm_result = await self._llm_classify(concept, target_audience, intended_outcome)
+        except Exception:
+            # The LLM stage is best-effort: any client failure falls back to
+            # the keyword result, but is logged instead of silently dropped.
+            self._logger.warning(
+                "LLM purpose classification failed; using keyword result",
+                exc_info=True,
+            )
+            return keyword_result
+
+        if llm_result.confidence > keyword_result.confidence:
+            return llm_result
+        return keyword_result
+
+    @classmethod
+    def _contains_keyword(cls, text: str, keyword: str) -> bool:
+        """Return True when ``keyword`` occurs in ``text`` at the start of a word.
+
+        Anchoring the start stops hits buried inside unrelated words ("api" in
+        "therapist", "icon" in "silicon") while still letting a keyword match
+        its inflections ("transform" in "transformation"). Very short keywords
+        must match as whole words because a mere prefix match on two or three
+        letters is too noisy.
+        """
+        pattern = rf"(?<!\w){re.escape(keyword)}"
+        if len(keyword) <= cls.WHOLE_WORD_MAX_LENGTH:
+            pattern += r"(?!\w)"
+        # re keeps its own cache of compiled patterns, so repeated calls are cheap.
+        return re.search(pattern, text) is not None
+
+    def _keyword_classify(
+        self,
+        concept: str,
+        target_audience: str,
+        intended_outcome: str,
+    ) -> ClassificationResult:
+        """Fast keyword-based classification.
+
+        The three inputs are joined and lowercased. Each purpose earns one
+        point per matching keyword and loses one point (never dropping below
+        zero) per matching negation phrase. The purpose with the most points
+        wins; ties go to the purpose declared first in ReaderPurpose.
+
+        Returns:
+            UNDERSTAND with the fallback confidence when no purpose scores
+            above zero. Otherwise the winning purpose, with confidence equal
+            to its share of all points awarded (capped at
+            ``MAX_KEYWORD_CONFIDENCE``) and a reasoning string listing up to
+            five of its matched keywords.
+        """
+        text = f"{concept} {target_audience} {intended_outcome}".lower()
+
+        matched = {
+            purpose: [kw for kw in keywords if self._contains_keyword(text, kw)]
+            for purpose, keywords in self.PURPOSE_KEYWORDS.items()
+        }
+        scores = {purpose: len(matched.get(purpose, [])) for purpose in ReaderPurpose}
+
+        for purpose, negations in self.PURPOSE_NEGATIONS.items():
+            penalty = sum(1 for phrase in negations if self._contains_keyword(text, phrase))
+            scores[purpose] = max(0, scores[purpose] - penalty)
+
+        # max() returns the first maximum, i.e. enum declaration order on ties.
+        top_purpose = max(scores, key=scores.get)
+        top_score = scores[top_purpose]
+
+        if top_score == 0:
+            return ClassificationResult(
+                purpose=ReaderPurpose.UNDERSTAND,
+                confidence=self.FALLBACK_CONFIDENCE,
+                reasoning="No clear purpose keywords found, defaulting to UNDERSTAND",
+            )
+
+        # Share of all awarded points held by the winner: 1.0 when only one
+        # purpose matched, lower when several purposes compete.
+        total_score = sum(scores.values())
+        confidence = min(self.MAX_KEYWORD_CONFIDENCE, top_score / total_score)
+
+        return ClassificationResult(
+            purpose=top_purpose,
+            confidence=confidence,
+            reasoning=f"Keywords matched: {', '.join(matched[top_purpose][:5])}",
+        )
+
+    async def _llm_classify(
+        self,
+        concept: str,
+        target_audience: str,
+        intended_outcome: str,
+    ) -> ClassificationResult:
+        """LLM-based classification.
+
+        Builds a prompt describing the six purposes, asks the configured
+        client for a JSON answer and parses it. Exceptions raised by the
+        client propagate to the caller.
+        """
+        prompt = f"""Analyze this book concept and determine WHY a reader would read this book.
+
+## Input
+- Concept/Title: {concept}
+- Target Audience: {target_audience or '(not specified)'}
+- Intended Outcome: {intended_outcome or '(not specified)'}
+
+## Options
+1. LEARN_HANDS_ON: Reader wants to DO something specific
+2. UNDERSTAND: Reader wants to GRASP a concept deeply  
+3. TRANSFORM: Reader wants to CHANGE themselves
+4. DECIDE: Reader wants to make an informed decision
+5. REFERENCE: Reader wants to LOOK UP information
+6. BE_INSPIRED: Reader wants to feel motivated
+
+## Output Format (JSON only)
+{{
+  "purpose": "one of: learn_hands_on, understand, transform, decide, reference, be_inspired",
+  "confidence": 0.0 to 1.0,
+  "reasoning": "1-2 sentences explaining why"
+}}
+
+Analyze:"""
+
+        result = await self.llm_client.complete_async(
+            system_prompt="You are a book categorization system. Return ONLY valid JSON.",
+            user_prompt=prompt,
+            temperature=0.3,
+            max_tokens=500,
+        )
+
+        return self._parse_llm_result(result)
+
+    @staticmethod
+    def _extract_json(result: str) -> str:
+        """Return the ``{...}`` portion of an LLM reply.
+
+        When the reply contains a Markdown code fence, only the text inside
+        the first fence is searched, which also copes with a language tag
+        such as ``json`` after the fence.
+
+        Raises:
+            ValueError: If the reply is not a string or holds no ``{...}`` span.
+        """
+        if not isinstance(result, str):
+            raise ValueError("LLM result is not text")
+        candidate = result.split("```", 2)[1] if "```" in result else result
+        start, end = candidate.find("{"), candidate.rfind("}") + 1
+        if start < 0 or end <= start:
+            raise ValueError("No JSON found")
+        return candidate[start:end]
+
+    def _parse_llm_result(self, result: str) -> ClassificationResult:
+        """Parse LLM response.
+
+        Never raises. A reply that is not text, holds no JSON object, or names
+        a purpose that is not a ReaderPurpose yields UNDERSTAND with the
+        fallback confidence, so the caller keeps its keyword result. A missing
+        or non-numeric confidence becomes ``DEFAULT_LLM_CONFIDENCE``; numeric
+        values are clamped to the range 0.0-1.0.
+        """
+        failure = ClassificationResult(
+            purpose=ReaderPurpose.UNDERSTAND,
+            confidence=self.FALLBACK_CONFIDENCE,
+            reasoning="LLM parse failed, defaulting to UNDERSTAND",
+        )
+
+        try:
+            # json.JSONDecodeError is a ValueError subclass.
+            data = json.loads(self._extract_json(result))
+        except ValueError:
+            return failure
+        if not isinstance(data, dict):
+            return failure
+
+        raw_purpose = data.get("purpose")
+        if not isinstance(raw_purpose, str):
+            return failure
+        # Accept "Be Inspired", "be-inspired", "LEARN_HANDS_ON", and so on.
+        label = re.sub(r"[\s\-]+", "_", raw_purpose.strip().lower())
+        label = self._LLM_PURPOSE_ALIASES.get(label, label)
+        try:
+            purpose = ReaderPurpose(label)
+        except ValueError:
+            # An unrecognised label must not be reported as a confident UNDERSTAND.
+            return failure
+
+        try:
+            confidence = float(data.get("confidence", self.DEFAULT_LLM_CONFIDENCE))
+        except (TypeError, ValueError):
+            confidence = self.DEFAULT_LLM_CONFIDENCE
+        confidence = min(1.0, max(0.0, confidence))
+
+        reasoning = data.get("reasoning")
+        if not isinstance(reasoning, str) or not reasoning.strip():
+            reasoning = "LLM classification"
+
+        return ClassificationResult(
+            purpose=purpose,
+            confidence=confidence,
+            reasoning=reasoning,
+        )
+
+
+async def classify_purpose(
+    concept: str,
+    target_audience: str = "",
+    intended_outcome: str = "",
+    llm_client=None,
+) -> ClassificationResult:
+    """One-shot helper: build a PurposeClassifier and classify the input.
+
+    Args:
+        concept: Book concept or working title.
+        target_audience: Optional description of the intended readers.
+        intended_outcome: Optional description of what readers should gain.
+        llm_client: Optional LLM client handed to PurposeClassifier.
+
+    Returns:
+        The ClassificationResult produced by ``PurposeClassifier.classify``.
+    """
+    return await PurposeClassifier(llm_client=llm_client).classify(
+        concept=concept,
+        target_audience=target_audience,
+        intended_outcome=intended_outcome,
+    )