# opus_orchestrator/agents/research.py

"""Research Agent for Opus Orchestrator.

Enhanced nonfiction agent with live research capabilities.
"""

import os
from typing import Any, Optional

from dotenv import load_dotenv


from opus_orchestrator.agents.base import BaseAgent, AgentResponse
from opus_orchestrator.utils.research import (
    ResearchOrchestrator,
    create_research_orchestrator,
    SearchTool,
    WikipediaTool,
    AcademicSearchTool,
)


# System prompt for the research agent.
#
# ResearchAgent.__init__ hands this string to BaseAgent as `system_prompt`.
# It is Markdown text describing the agent's role, the tools it is told it
# has, the research process to follow, and the section layout
# (Findings / Sources / Innovations / Research Gaps) the reply should use.
# The text is sent to the model verbatim, so whitespace inside it is
# significant — edit with care.
RESEARCH_AGENT_SYSTEM_PROMPT = """## Role: Research Agent with Live Web Access

You are The Researcher — an AI agent with live access to the internet, academic databases, and research tools.

## Your Capabilities

1. **Web Search** - Search the current web for latest information
2. **Wikipedia** - Access encyclopedic knowledge
3. **Academic Search** - Find peer-reviewed papers (CrossRef, Semantic Scholar)
4. **Innovation Detection** - Identify gaps and new ideas beyond training data

## Your Mission

NOT just verify facts — **DISCOVER new information, trends, and innovations**.

- Find what's NEW since your training cutoff
- Identify research gaps and opportunities  
- Connect disparate ideas into novel insights
- Go beyond what you "know" to what you can FIND

## Research Process

1. **Explore** - Broad search on topic
2. **Deep Dive** - Specific searches on subtopics
3. **Cross-Reference** - Find connections between sources
4. **Innovate** - Generate original insights beyond training data

## Output Format

Provide your research in this structure:

```
## Findings (What you discovered)
- [New information 1]
- [New information 2]
- [Latest developments]

## Sources (Where you found it)
- [URL 1]: [Title]
- [URL 2]: [Title]

## Innovations (Original insights beyond training data)
- [Novel connection 1]
- [Novel connection 2]

## Research Gaps (What's not well-covered)
- [Gap 1]
- [Gap 2]
```

## Remember

You're not just fact-checking — you're RESEARCHING. Actively seek new information, 
challenge assumptions, and generate original ideas. This keeps the content fresh 
and prevents "AI slop" from repetitive training data patterns.
"""


class ResearchAgent(BaseAgent):
    """Research agent that gathers live search results and asks the LLM to synthesize them.

    The agent runs a query through a research orchestrator, renders the raw
    results as a Markdown digest, and sends that digest to the LLM together
    with the original task.
    """

    def __init__(
        self,
        config=None,
        search_provider: str = "tavily",
        use_wikipedia: bool = True,
        use_academic: bool = True,
    ):
        """Initialize research agent with tools.

        Args:
            config: Agent configuration, forwarded unchanged to ``BaseAgent``.
            search_provider: Search provider (tavily, serper, brave, duckduckgo)
            use_wikipedia: Include Wikipedia search
            use_academic: Include academic search
        """
        # The orchestrator is what ``execute`` actually drives.
        self.research = create_research_orchestrator(
            search_provider=search_provider,
            use_wikipedia=use_wikipedia,
            use_academic=use_academic,
        )

        # Stand-alone tool handles; not used by the methods of this class,
        # kept as public attributes for callers that want direct access.
        self.search_tool = SearchTool(provider=search_provider)
        self.wikipedia = WikipediaTool() if use_wikipedia else None
        self.academic = AcademicSearchTool() if use_academic else None

        super().__init__(
            role="Research Agent",
            description="Live web research with innovation detection",
            system_prompt=RESEARCH_AGENT_SYSTEM_PROMPT,
            config=config,
        )

    async def execute(self, input_data: Any, context: dict[str, Any]) -> AgentResponse:
        """Execute research task with live tools.

        Args:
            input_data: Either a dict with the keys ``query`` (required),
                ``subtopics`` (optional list) and ``deep_research`` (optional
                flag), or any other object, which is converted with ``str``
                and used as the query.
            context: Additional context, passed to ``build_system_prompt``.

        Returns:
            An ``AgentResponse``. On success ``output`` is a dict with the
            keys ``raw_results``, ``synthesis`` and ``query``. When the query
            is missing/blank or any step raises, ``success`` is ``False`` and
            ``error`` carries the reason; exceptions are not propagated.
        """
        if isinstance(input_data, dict):
            # ``or`` also covers keys that are present but set to None.
            query = input_data.get("query") or ""
            subtopics = input_data.get("subtopics") or []
            deep = input_data.get("deep_research", False)
        else:
            # str(None) would yield the literal query "None"; treat it as missing.
            query = "" if input_data is None else str(input_data)
            subtopics = []
            deep = False

        # A whitespace-only query is as useless as an empty one.
        if not str(query).strip():
            return AgentResponse(
                success=False,
                output=None,
                error="No research query provided",
                metadata={"role": "Research Agent"},
            )

        try:
            # NOTE(review): both orchestrator calls are made synchronously from
            # a coroutine; if they perform blocking network I/O they will stall
            # the event loop — confirm against the orchestrator implementation.
            if deep or subtopics:
                # Deep research with subtopics
                results = self.research.deep_research(query, subtopics)
            else:
                # Quick comprehensive search
                results = self.research.comprehensive_search(query)

            # Format results for LLM
            research_summary = self._format_research_for_llm(results)

            # Use LLM to synthesize and provide analysis
            synthesis = await self.call_llm(
                system_prompt=self.build_system_prompt(context),
                user_prompt=f"""Based on this research data, provide analysis and insights:

{research_summary}

Task: {query}

Provide:
1. Key findings synthesized
2. Most important innovations/discoveries
3. How this goes beyond typical training data
4. Recommendations for the manuscript""",
            )

            return AgentResponse(
                success=True,
                output={
                    "raw_results": results,
                    "synthesis": synthesis,
                    "query": query,
                },
                metadata={
                    "role": "Research Agent",
                    "search_provider": self.research.search.provider,
                },
            )

        except Exception as e:
            # Agent boundary: report the failure in the response instead of raising.
            return AgentResponse(
                success=False,
                output=None,
                error=f"Research failed: {str(e)}",
                metadata={"role": "Research Agent"},
            )

    @staticmethod
    def _field(record: dict, key: str, default: Any = "") -> Any:
        """Return ``record[key]``, or ``default`` when the key is missing or None."""
        value = record.get(key)
        return default if value is None else value

    @classmethod
    def _snippet(cls, record: dict, key: str, limit: int = 200) -> str:
        """Return the first ``limit`` characters of a text field followed by ``...``."""
        return f"{str(cls._field(record, key))[:limit]}..."

    def _format_research_for_llm(self, results: dict) -> str:
        """Format research results for LLM consumption.

        Args:
            results: Dict that may contain ``query``, ``timestamp``, ``web``,
                ``wikipedia``, ``academic`` and ``innovations``. Missing or
                ``None`` sections are skipped, and missing or ``None`` fields
                inside an entry render as empty text rather than raising.

        Returns:
            A Markdown digest: a header, then up to 5 web results, 3 Wikipedia
            entries and 5 academic papers, then every listed innovation.
        """
        lines = [
            f"# Research Query: {results.get('query', '')}",
            f"Timestamp: {results.get('timestamp', '')}",
            "",
        ]

        # Web results
        web = results.get("web") or []
        if web:
            lines.append("## Web Search Results")
            for rank, hit in enumerate(web[:5], 1):
                lines.append(f"{rank}. **{self._field(hit, 'title')}**")
                lines.append(f"   URL: {self._field(hit, 'url')}")
                lines.append(f"   {self._snippet(hit, 'content')}")
                lines.append("")

        # Wikipedia
        wiki = results.get("wikipedia") or []
        if wiki:
            lines.append("## Wikipedia Results")
            for page in wiki[:3]:
                lines.append(
                    f"- {self._field(page, 'title')}: {self._snippet(page, 'summary')}"
                )
            lines.append("")

        # Academic
        academic = results.get("academic") or []
        if academic:
            lines.append("## Academic Papers")
            for paper in academic[:5]:
                lines.append(
                    f"- {self._field(paper, 'title')} ({self._field(paper, 'year', 'N/A')})"
                )
                lines.append(f"  {self._field(paper, 'journal')}")
            lines.append("")

        # Innovations
        innovations = results.get("innovations") or []
        if innovations:
            lines.append("## Innovations & New Ideas")
            lines.extend(f"- {idea}" for idea in innovations)
            lines.append("")

        return "\n".join(lines)


# Fact-checking with live verification
class VerifiedFactChecker:
    """Fact checker with live source verification.

    Verification is a bag-of-words heuristic, not semantic analysis: a source
    "supports" a claim when it contains more than 70% of the claim's distinct
    words, and "contradicts" it when it does not but contains a negation
    word. A source that repeats the claim in negated form is therefore still
    counted as supporting; treat the result as a hint, not a verdict.
    """

    # Whole words whose presence marks a non-supporting source as contradicting.
    _NEGATION_WORDS = frozenset({"not", "false", "incorrect"})
    # Characters trimmed from both ends of a token so "cheese." equals "cheese".
    _EDGE_PUNCTUATION = ".,;:!?\"'()[]{}<>*_`"
    # Fraction of the claim's distinct words a source must exceed to support it.
    _SUPPORT_THRESHOLD = 0.7

    def __init__(self, search_provider: str = "tavily"):
        """Initialize verified fact checker.

        Args:
            search_provider: Provider name handed to ``SearchTool``.
        """
        self.search = SearchTool(provider=search_provider)
        self.wikipedia = WikipediaTool()

    @classmethod
    def _tokenize(cls, text: str) -> set[str]:
        """Return the distinct lower-cased words of ``text`` without edge punctuation."""
        stripped = (word.strip(cls._EDGE_PUNCTUATION) for word in text.lower().split())
        return {word for word in stripped if word}

    @staticmethod
    def _source_text(source) -> str:
        """Return the searchable text of one result, or ``""`` if it has none.

        Web results are read from ``content``. Wikipedia results are
        presumably keyed by ``summary`` (as elsewhere in this module) —
        verify against ``WikipediaTool``.
        """
        if not isinstance(source, dict):
            return ""
        return str(source.get("content") or source.get("summary") or "")

    async def verify_claim(
        self,
        claim: str,
        context: str = "",
    ) -> dict:
        """Verify a factual claim against live sources.

        Args:
            claim: The claim to verify
            context: Additional context (currently unused; kept for API
                compatibility)

        Returns:
            Dict with the keys ``claim``, ``verified`` (at least one
            supporting source), ``confidence`` (supporting / all sources,
            0.0 when there are none), ``supporting_sources``,
            ``contradicting_sources``, ``neutral_sources`` and
            ``needs_citation`` (confidence below 0.8).
        """
        supporting: list = []
        contradicting: list = []
        neutral: list = []

        claim_words = self._tokenize(claim or "")
        # A blank claim cannot be supported by anything; skip the network calls.
        if claim_words:
            # NOTE(review): both searches run synchronously inside a coroutine.
            web_results = self.search.search(claim, num_results=5)
            wiki_results = self.wikipedia.search(claim, num_results=2)

            needed = len(claim_words) * self._SUPPORT_THRESHOLD
            for source in [*(web_results or []), *(wiki_results or [])]:
                source_words = self._tokenize(self._source_text(source))
                if len(claim_words & source_words) > needed:
                    supporting.append(source)
                elif source_words & self._NEGATION_WORDS:
                    # Whole-word match: "another"/"notable" must not count as "not".
                    contradicting.append(source)
                else:
                    neutral.append(source)

        total = len(supporting) + len(contradicting) + len(neutral)
        confidence = len(supporting) / total if total else 0.0

        return {
            "claim": claim,
            "verified": len(supporting) > 0,
            "confidence": confidence,
            "supporting_sources": supporting,
            "contradicting_sources": contradicting,
            "neutral_sources": neutral,
            "needs_citation": confidence < 0.8,
        }

    async def verify_batch(
        self,
        claims: list[str],
    ) -> list[dict]:
        """Verify multiple claims.

        Args:
            claims: List of claims to verify

        Returns:
            List of verification results, one per claim, in input order.
        """
        # Claims are checked one after another, not concurrently.
        return [await self.verify_claim(claim) for claim in claims]


def create_research_agent(
    search_provider: str = "tavily",
    *,
    config=None,
    use_wikipedia: bool = True,
    use_academic: bool = True,
) -> ResearchAgent:
    """Factory to create a research agent.

    Every argument is forwarded to ``ResearchAgent`` under the same name, and
    the defaults match the constructor's, so calling this with only
    ``search_provider`` (or with nothing) behaves as before.

    Args:
        search_provider: Search provider
        config: Agent configuration (keyword-only)
        use_wikipedia: Include Wikipedia search (keyword-only)
        use_academic: Include academic search (keyword-only)

    Returns:
        Configured ResearchAgent
    """
    return ResearchAgent(
        config=config,
        search_provider=search_provider,
        use_wikipedia=use_wikipedia,
        use_academic=use_academic,
    )
Add live research capabilities with innovation detection 2026-03-13 05:03:52 +00:00			`"""Research Agent for Opus Orchestrator.`

			`Enhanced nonfiction agent with live research capabilities.`
			`"""`

			`import os`
			`from typing import Any, Optional`

			`from dotenv import load_dotenv`


			`from opus_orchestrator.agents.base import BaseAgent, AgentResponse`
			`from opus_orchestrator.utils.research import (`
			`ResearchOrchestrator,`
			`create_research_orchestrator,`
			`SearchTool,`
			`WikipediaTool,`
			`AcademicSearchTool,`
			`)`


			`# System prompt for research agent`
			`RESEARCH_AGENT_SYSTEM_PROMPT = """## Role: Research Agent with Live Web Access`

			`You are The Researcher — an AI agent with live access to the internet, academic databases, and research tools.`

			`## Your Capabilities`

			`1. Web Search - Search the current web for latest information`
			`2. Wikipedia - Access encyclopedic knowledge`
			`3. Academic Search - Find peer-reviewed papers (CrossRef, Semantic Scholar)`
			`4. Innovation Detection - Identify gaps and new ideas beyond training data`

			`## Your Mission`

			`NOT just verify facts — DISCOVER new information, trends, and innovations.`

			`- Find what's NEW since your training cutoff`
			`- Identify research gaps and opportunities`
			`- Connect disparate ideas into novel insights`
			`- Go beyond what you "know" to what you can FIND`

			`## Research Process`

			`1. Explore - Broad search on topic`
			`2. Deep Dive - Specific searches on subtopics`
			`3. Cross-Reference - Find connections between sources`
			`4. Innovate - Generate original insights beyond training data`

			`## Output Format`

			`Provide your research in this structure:`

			```
			`## Findings (What you discovered)`
			`- [New information 1]`
			`- [New information 2]`
			`- [Latest developments]`

			`## Sources (Where you found it)`
			`- [URL 1]: [Title]`
			`- [URL 2]: [Title]`

			`## Innovations (Original insights beyond training data)`
			`- [Novel connection 1]`
			`- [Novel connection 2]`

			`## Research Gaps (What's not well-covered)`
			`- [Gap 1]`
			`- [Gap 2]`
			```

			`## Remember`

			`You're not just fact-checking — you're RESEARCHING. Actively seek new information,`
			`challenge assumptions, and generate original ideas. This keeps the content fresh`
			`and prevents "AI slop" from repetitive training data patterns.`
			`"""`


			`class ResearchAgent(BaseAgent):`
			`"""Enhanced research agent with live web access and innovation detection."""`

			`def __init__(`
			`self,`
			`config=None,`
			`search_provider: str = "tavily",`
			`use_wikipedia: bool = True,`
			`use_academic: bool = True,`
			`):`
			`"""Initialize research agent with tools.`

			`Args:`
			`config: Agent configuration`
			`search_provider: Search provider (tavily, serper, brave, duckduckgo)`
			`use_wikipedia: Include Wikipedia search`
			`use_academic: Include academic search`
			`"""`
			`# Initialize research tools`
			`self.research = create_research_orchestrator(`
			`search_provider=search_provider,`
			`use_wikipedia=use_wikipedia,`
			`use_academic=use_academic,`
			`)`

			`self.search_tool = SearchTool(provider=search_provider)`
			`self.wikipedia = WikipediaTool() if use_wikipedia else None`
			`self.academic = AcademicSearchTool() if use_academic else None`

			`super().__init__(`
			`role="Research Agent",`
			`description="Live web research with innovation detection",`
			`system_prompt=RESEARCH_AGENT_SYSTEM_PROMPT,`
			`config=config,`
			`)`

			`async def execute(self, input_data: Any, context: dict[str, Any]) -> AgentResponse:`
			`"""Execute research task with live tools.`

			`Args:`
			`input_data: Research query and parameters`
			`context: Additional context`

			`Returns:`
			`Research findings with sources and innovations`
			`"""`
			`# Extract query`
			`if isinstance(input_data, dict):`
			`query = input_data.get("query", "")`
			`subtopics = input_data.get("subtopics", [])`
			`deep = input_data.get("deep_research", False)`
			`else:`
			`query = str(input_data)`
			`subtopics = []`
			`deep = False`

			`if not query:`
			`return AgentResponse(`
			`success=False,`
			`output=None,`
			`error="No research query provided",`
			`metadata={"role": "Research Agent"},`
			`)`

			`try:`
			`# Perform research`
			`if deep or subtopics:`
			`# Deep research with subtopics`
			`results = self.research.deep_research(query, subtopics)`
			`else:`
			`# Quick comprehensive search`
			`results = self.research.comprehensive_search(query)`

			`# Format results for LLM`
			`research_summary = self._format_research_for_llm(results)`

			`# Use LLM to synthesize and provide analysis`
			`synthesis = await self.call_llm(`
			`system_prompt=self.build_system_prompt(context),`
			`user_prompt=f"""Based on this research data, provide analysis and insights:`

			`{research_summary}`

			`Task: {query}`

			`Provide:`
			`1. Key findings synthesized`
			`2. Most important innovations/discoveries`
			`3. How this goes beyond typical training data`
			`4. Recommendations for the manuscript""",`
			`)`

			`return AgentResponse(`
			`success=True,`
			`output={`
			`"raw_results": results,`
			`"synthesis": synthesis,`
			`"query": query,`
			`},`
			`metadata={`
			`"role": "Research Agent",`
			`"search_provider": self.research.search.provider,`
			`},`
			`)`

			`except Exception as e:`
			`return AgentResponse(`
			`success=False,`
			`output=None,`
			`error=f"Research failed: {str(e)}",`
			`metadata={"role": "Research Agent"},`
			`)`

			`def _format_research_for_llm(self, results: dict) -> str:`
			`"""Format research results for LLM consumption."""`
			`output = []`

			`# Query`
			`output.append(f"# Research Query: {results.get('query', '')}")`
			`output.append(f"Timestamp: {results.get('timestamp', '')}")`
			`output.append("")`

			`# Web results`
			`web = results.get("web", [])`
			`if web:`
			`output.append("## Web Search Results")`
			`for i, r in enumerate(web[:5], 1):`
			`output.append(f"{i}. {r.get('title', '')}")`
			`output.append(f" URL: {r.get('url', '')}")`
			`output.append(f" {r.get('content', '')[:200]}...")`
			`output.append("")`

			`# Wikipedia`
			`wiki = results.get("wikipedia", [])`
			`if wiki:`
			`output.append("## Wikipedia Results")`
			`for r in wiki[:3]:`
			`output.append(f"- {r.get('title', '')}: {r.get('summary', '')[:200]}...")`
			`output.append("")`

			`# Academic`
			`academic = results.get("academic", [])`
			`if academic:`
			`output.append("## Academic Papers")`
			`for r in academic[:5]:`
			`output.append(f"- {r.get('title', '')} ({r.get('year', 'N/A')})")`
			`output.append(f" {r.get('journal', '')}")`
			`output.append("")`

			`# Innovations`
			`innovations = results.get("innovations", [])`
			`if innovations:`
			`output.append("## Innovations & New Ideas")`
			`for i in innovations:`
			`output.append(f"- {i}")`
			`output.append("")`

			`return "\n".join(output)`


			`# Fact-checking with live verification`
			`class VerifiedFactChecker:`
			`"""Fact checker with live source verification."""`

			`def __init__(self, search_provider: str = "tavily"):`
			`"""Initialize verified fact checker."""`
			`self.search = SearchTool(provider=search_provider)`
			`self.wikipedia = WikipediaTool()`

			`async def verify_claim(`
			`self,`
			`claim: str,`
			`context: str = "",`
			`) -> dict:`
			`"""Verify a factual claim against live sources.`

			`Args:`
			`claim: The claim to verify`
			`context: Additional context`

			`Returns:`
			`Verification result with confidence and sources`
			`"""`
			`# Search for the claim`
			`results = self.search.search(claim, num_results=5)`

			`# Check Wikipedia`
			`wiki_results = self.wikipedia.search(claim, num_results=2)`

			`# Analyze`
			`supporting = []`
			`contradicting = []`
			`neutral = []`

			`for r in results:`
			`content = r.get("content", "").lower()`
			`claim_lower = claim.lower()`

			`# Simple keyword matching`
			`claim_words = set(claim_lower.split())`
			`content_words = set(content.split())`
			`overlap = claim_words & content_words`

			`if len(overlap) > len(claim_words) * 0.7:`
			`supporting.append(r)`
			`elif "not" in content or "false" in content or "incorrect" in content:`
			`contradicting.append(r)`
			`else:`
			`neutral.append(r)`

			`# Calculate confidence`
			`total = len(supporting) + len(contradicting) + len(neutral)`
			`if total == 0:`
			`confidence = 0.0`
			`else:`
			`confidence = len(supporting) / total`

			`return {`
			`"claim": claim,`
			`"verified": len(supporting) > 0,`
			`"confidence": confidence,`
			`"supporting_sources": supporting,`
			`"contradicting_sources": contradicting,`
			`"neutral_sources": neutral,`
			`"needs_citation": confidence < 0.8,`
			`}`

			`async def verify_batch(`
			`self,`
			`claims: list[str],`
			`) -> list[dict]:`
			`"""Verify multiple claims.`

			`Args:`
			`claims: List of claims to verify`

			`Returns:`
			`List of verification results`
			`"""`
			`results = []`
			`for claim in claims:`
			`result = await self.verify_claim(claim)`
			`results.append(result)`
			`return results`


			`def create_research_agent(`
			`search_provider: str = "tavily",`
			`) -> ResearchAgent:`
			`"""Factory to create a research agent.`

			`Args:`
			`search_provider: Search provider`

			`Returns:`
			`Configured ResearchAgent`
			`"""`
			`return ResearchAgent(search_provider=search_provider)`