4b4addedf7
- Created opus_orchestrator/nonfiction/classifier.py - PurposeClassifier class with keyword-based classification - LLM-enhanced classification (optional) - ReaderPurpose enum (6 purposes) - ClassificationResult dataclass - Keyword classification covers: - LEARN_HANDS_ON: how to, learn to, tutorial, skills, etc. - UNDERSTAND: understand, why, concept, mental model, etc. - TRANSFORM: change, become, improve, habits, etc. - DECIDE: decide, choose, compare, vs, analysis - REFERENCE: manual, handbook, comprehensive, API - BE_INSPIRED: inspire, story, journey, biography - Tests pass for all 6 purposes with high confidence This is the foundation for the entire nonfiction pipeline (Issue #18).
267 lines
10 KiB
Python
267 lines
10 KiB
Python
"""Purpose Classifier for Nonfiction Books.
|
|
|
|
Classifies user input into ReaderPurpose - why the reader will be reading this book.
|
|
This is the foundation for the entire nonfiction pipeline.
|
|
|
|
Usage:
|
|
from opus_orchestrator.nonfiction.classifier import PurposeClassifier, ReaderPurpose
|
|
|
|
classifier = PurposeClassifier()
|
|
result = await classifier.classify(
|
|
concept="Leadership for introverts",
|
|
target_audience="Introverted professionals who want to develop leadership skills",
|
|
intended_outcome="Learn to lead with quiet confidence"
|
|
)
|
|
|
|
print(result.purpose)     # e.g. ReaderPurpose.LEARN_HANDS_ON
print(result.confidence)  # e.g. 0.95
print(result.reasoning)   # e.g. "Keywords matched: learn to, skills"
|
|
"""
|
|
|
|
import json
import logging
import re
from dataclasses import dataclass
from enum import Enum
from typing import Optional
|
|
|
|
|
|
class ReaderPurpose(str, Enum):
    """The reason a reader picks up a nonfiction book.

    Members are also ``str`` instances, so each one compares equal to its
    lowercase value and can be looked up from it, e.g.
    ``ReaderPurpose("transform")``.
    """

    # Reader wants to DO something specific.
    LEARN_HANDS_ON = "learn_hands_on"
    # Reader wants to GRASP a concept deeply.
    UNDERSTAND = "understand"
    # Reader wants to CHANGE themselves.
    TRANSFORM = "transform"
    # Reader wants to make an informed decision.
    DECIDE = "decide"
    # Reader wants to LOOK UP information.
    REFERENCE = "reference"
    # Reader wants to feel motivated.
    BE_INSPIRED = "be_inspired"
|
|
|
|
|
|
@dataclass
class ClassificationResult:
    """Outcome of classifying a book concept into a reader purpose.

    Attributes:
        purpose: The purpose that was selected.
        confidence: Score attached to the selection by the classifier.
        reasoning: Short human-readable explanation of the selection.
        alternative_purposes: Optional list of other candidates; ``None``
            when the producer did not supply any.
    """

    purpose: ReaderPurpose
    confidence: float
    reasoning: str
    alternative_purposes: Optional[list] = None
|
|
|
|
|
|
class PurposeClassifier:
    """Classify a book concept into the ReaderPurpose it serves.

    A fast keyword pass always runs first.  If its confidence is below
    ``KEYWORD_CONFIDENCE_THRESHOLD`` and an LLM client was supplied, the LLM
    is consulted and its answer is used only when it is more confident than
    the keyword result.

    Keywords are matched case-insensitively and must begin at the start of a
    word, so "story" matches "storytelling" but not "history", and "api"
    does not match "therapy".
    """

    PURPOSE_KEYWORDS = {
        ReaderPurpose.LEARN_HANDS_ON: [
            "how to", "how-to", "learn to", "master", "step by step",
            "beginner's guide", "tutorial", "practical", "hands-on",
            "skills", "do it yourself", "build", "create", "make",
            "implement", "develop skills", "learn skills", "course",
            "workshop", "training", "teach yourself", "guide to",
            "becoming", "learn the basics", "fundamentals",
        ],
        ReaderPurpose.UNDERSTAND: [
            "understand", "why", "how it works", "explain", "concept",
            "mental model", "deep dive", "exploration", "the nature of",
            "the truth about", "what is", "meaning", "philosophy",
            "theory", "framework", "principles", "inside story",
            "real story", "hidden", "secret", "science of",
            "psychology of", "the way", "essence", "sapiens",
        ],
        ReaderPurpose.TRANSFORM: [
            "transform", "change", "become", "develop", "improve",
            "better", "overcome", "heal", "grow", "personal growth",
            "self-improvement", "self help", "empower", "breakthrough",
            "awakening", "journey", "awaken", "reinvent",
            "reclaim", "freedom", "love yourself", "healing",
            "recovery", "manifest", "attract", "abundance",
            "habits", "routines", "mindset", "productivity",
        ],
        ReaderPurpose.DECIDE: [
            "decide", "choose", "compare", "vs", "versus",
            "which is better", "pros and cons", "trade-off", "decision",
            "guide", "strategies", "strategy", "choosing", "selecting",
            "investment", "where to put", "how to allocate", "prioritize",
            "business case", "roi", "worth it", "should i", "analysis",
        ],
        ReaderPurpose.REFERENCE: [
            "reference", "manual", "handbook", "dictionary", "encyclopedia",
            "comprehensive", "complete guide", "all about", "definitive",
            "bible", "catalog", "directory", "index", "lookup",
            "specification", "documentation", "api", "technical",
            "architecture", "system design", "best practices",
        ],
        ReaderPurpose.BE_INSPIRED: [
            "inspire", "motivational", "biography", "memoir", "story",
            "life", "journey", "triumph", "overcoming", "against all odds",
            "unstoppable", "dream", "vision", "legacy", "purpose",
            "calling", "warrior", "hero", "legend", "icon",
        ],
    }

    # Each negation found in the text removes one point from the purpose it
    # is listed under (never dropping the score below zero).
    PURPOSE_NEGATIONS = {
        ReaderPurpose.LEARN_HANDS_ON: ["understand", "explain", "why", "concept"],
        ReaderPurpose.TRANSFORM: ["reference", "manual", "tutorial"],
        ReaderPurpose.UNDERSTAND: ["how to", "step by step", "tutorial"],
    }

    # Keyword results at or above this confidence are returned without
    # consulting the LLM.
    KEYWORD_CONFIDENCE_THRESHOLD = 0.8

    def __init__(self, llm_client=None):
        """Store the optional LLM client.

        Args:
            llm_client: Object exposing an awaitable ``complete_async(
                system_prompt=..., user_prompt=..., temperature=...,
                max_tokens=...)``.  When falsy, only keyword classification
                is performed.
        """
        self.llm_client = llm_client

    async def classify(
        self,
        concept: str,
        target_audience: str = "",
        intended_outcome: str = "",
    ) -> ClassificationResult:
        """Classify user input into a ReaderPurpose.

        Args:
            concept: Book concept or title.
            target_audience: Who the book is for (optional).
            intended_outcome: What the reader should get out of it (optional).

        Returns:
            The keyword result when it is confident enough, when no LLM
            client is configured, or when the LLM call fails; otherwise the
            more confident of the keyword and LLM results.
        """
        keyword_result = self._keyword_classify(concept, target_audience, intended_outcome)

        if keyword_result.confidence >= self.KEYWORD_CONFIDENCE_THRESHOLD:
            return keyword_result
        if not self.llm_client:
            return keyword_result

        try:
            llm_result = await self._llm_classify(concept, target_audience, intended_outcome)
        except Exception:
            # LLM enhancement is best-effort: record the failure instead of
            # hiding it, then fall back to the keyword answer.
            logging.getLogger(__name__).warning(
                "LLM purpose classification failed; using keyword result",
                exc_info=True,
            )
            return keyword_result

        if llm_result.confidence > keyword_result.confidence:
            return llm_result
        return keyword_result

    @staticmethod
    def _find_keywords(text: str, keywords) -> list:
        """Return the keywords that occur in *text* starting at a word boundary.

        *text* is expected to be lowercased already.  The lookbehind rejects
        hits that begin in the middle of a word, while still allowing the
        keyword to be a prefix of a longer word ("develop" -> "development").
        """
        return [
            keyword
            for keyword in keywords
            if re.search(r"(?<!\w)" + re.escape(keyword.lower()), text)
        ]

    def _keyword_classify(
        self,
        concept: str,
        target_audience: str,
        intended_outcome: str,
    ) -> ClassificationResult:
        """Fast keyword-based classification.

        Each purpose scores one point per matching keyword, minus one point
        per matching negation (floored at zero).  The highest-scoring purpose
        wins; ties go to the purpose declared first in ReaderPurpose.

        Returns:
            UNDERSTAND with confidence 0.3 when nothing scores; otherwise the
            winning purpose with a confidence capped at 0.95.
        """
        text = f"{concept} {target_audience} {intended_outcome}".lower()

        matches = {
            purpose: self._find_keywords(text, self.PURPOSE_KEYWORDS.get(purpose, ()))
            for purpose in ReaderPurpose
        }
        scores = {purpose: len(found) for purpose, found in matches.items()}

        for purpose, negations in self.PURPOSE_NEGATIONS.items():
            penalty = len(self._find_keywords(text, negations))
            scores[purpose] = max(0, scores[purpose] - penalty)

        # max() returns the first maximal key, i.e. enum declaration order.
        top_purpose = max(scores, key=scores.get)
        top_score = scores[top_purpose]

        if top_score == 0:
            return ClassificationResult(
                purpose=ReaderPurpose.UNDERSTAND,
                confidence=0.3,
                reasoning="No clear purpose keywords found, defaulting to UNDERSTAND",
            )

        # NOTE(review): the divisor is the number of purposes with any score,
        # not the sum of all scores, so a lone keyword hit already yields the
        # 0.95 cap.  Kept as-is to preserve existing thresholds; confirm intent.
        competing_purposes = sum(1 for score in scores.values() if score > 0)
        confidence = min(0.95, top_score / competing_purposes)

        return ClassificationResult(
            purpose=top_purpose,
            confidence=confidence,
            reasoning=f"Keywords matched: {', '.join(matches[top_purpose][:5])}",
        )

    async def _llm_classify(
        self,
        concept: str,
        target_audience: str,
        intended_outcome: str,
    ) -> ClassificationResult:
        """LLM-based classification.

        Builds the prompt, awaits ``self.llm_client.complete_async`` and
        hands the raw response to ``_parse_llm_result``.  Exceptions raised
        by the client propagate to the caller.
        """
        prompt = f"""Analyze this book concept and determine WHY a reader would read this book.

## Input
- Concept/Title: {concept}
- Target Audience: {target_audience or '(not specified)'}
- Intended Outcome: {intended_outcome or '(not specified)'}

## Options
1. LEARN_HANDS_ON: Reader wants to DO something specific
2. UNDERSTAND: Reader wants to GRASP a concept deeply
3. TRANSFORM: Reader wants to CHANGE themselves
4. DECIDE: Reader wants to make an informed decision
5. REFERENCE: Reader wants to LOOK UP information
6. BE_INSPIRED: Reader wants to feel motivated

## Output Format (JSON only)
{{
"purpose": "one of: learn_hands_on, understand, transform, decide, reference, be_inspired",
"confidence": 0.0 to 1.0,
"reasoning": "1-2 sentences explaining why"
}}

Analyze:"""

        result = await self.llm_client.complete_async(
            system_prompt="You are a book categorization system. Return ONLY valid JSON.",
            user_prompt=prompt,
            temperature=0.3,
            max_tokens=500,
        )

        return self._parse_llm_result(result)

    @staticmethod
    def _extract_json(raw: str) -> str:
        """Pull the JSON payload out of a raw LLM response.

        Prefers a ```json fenced block, then any fenced block, then the span
        from the first "{" to the last "}".

        Raises:
            ValueError: If no candidate JSON text is present.
        """
        if "```json" in raw:
            return raw.split("```json", 1)[1].split("```", 1)[0]
        if "```" in raw:
            return raw.split("```", 2)[1]
        start, end = raw.find("{"), raw.rfind("}") + 1
        if start >= 0 and end > start:
            return raw[start:end]
        raise ValueError("No JSON found")

    def _parse_llm_result(self, result: str) -> ClassificationResult:
        """Parse an LLM response into a ClassificationResult.

        Never raises: any response that is not a JSON object with a
        recognised purpose and a numeric confidence yields UNDERSTAND with
        confidence 0.3, which cannot outrank a keyword result.

        Args:
            result: Raw text returned by the LLM client.

        Returns:
            The parsed classification, with confidence clamped to [0, 1].
        """
        def fallback(reason: str) -> ClassificationResult:
            return ClassificationResult(
                purpose=ReaderPurpose.UNDERSTAND,
                confidence=0.3,
                reasoning=reason,
            )

        parse_failed = "LLM parse failed, defaulting to UNDERSTAND"

        try:
            # json.JSONDecodeError is a ValueError; TypeError covers a
            # non-string response from the client.
            data = json.loads(self._extract_json(result))
        except (TypeError, ValueError):
            return fallback(parse_failed)

        if not isinstance(data, dict):
            return fallback(parse_failed)

        raw_purpose = data.get("purpose")
        if not isinstance(raw_purpose, str):
            return fallback(parse_failed)

        # Accept the enum values plus the "learn" shorthand; tolerate
        # spaces or hyphens in place of underscores ("be inspired").
        purpose_map = {purpose.value: purpose for purpose in ReaderPurpose}
        purpose_map["learn"] = ReaderPurpose.LEARN_HANDS_ON
        purpose_key = re.sub(r"[\s\-]+", "_", raw_purpose.strip().lower())
        purpose = purpose_map.get(purpose_key)
        if purpose is None:
            # An unknown label must not be reported as a confident UNDERSTAND.
            return fallback(
                f"LLM returned unrecognized purpose {raw_purpose!r}, defaulting to UNDERSTAND"
            )

        try:
            confidence = float(data.get("confidence", 0.7))
        except (TypeError, ValueError):
            return fallback(parse_failed)
        confidence = min(1.0, max(0.0, confidence))

        return ClassificationResult(
            purpose=purpose,
            confidence=confidence,
            reasoning=str(data.get("reasoning") or "LLM classification"),
        )
|
|
|
|
|
|
async def classify_purpose(
    concept: str,
    target_audience: str = "",
    intended_outcome: str = "",
    llm_client=None,
) -> ClassificationResult:
    """Classify a book concept in one call.

    Builds a throwaway PurposeClassifier around ``llm_client`` and awaits
    its ``classify`` method with the three text inputs.

    Args:
        concept: Book concept or title.
        target_audience: Who the book is for (optional).
        intended_outcome: What the reader should get out of it (optional).
        llm_client: Optional LLM client forwarded to PurposeClassifier.

    Returns:
        The ClassificationResult produced by the classifier.
    """
    return await PurposeClassifier(llm_client).classify(
        concept, target_audience, intended_outcome
    )
|