opus-orchestrator-ai/opus_orchestrator/agents/research.py

"""Research Agent for Opus Orchestrator.

Enhanced nonfiction agent with live research capabilities.
"""

import os
from typing import Any, Optional

from dotenv import load_dotenv


from opus_orchestrator.agents.base import BaseAgent, AgentResponse
from opus_orchestrator.utils.research import (
    ResearchOrchestrator,
    create_research_orchestrator,
    SearchTool,
    WikipediaTool,
    AcademicSearchTool,
)


# System prompt for the research agent.
#
# ResearchAgent.__init__ hands this text to BaseAgent as ``system_prompt``.
# It is Markdown: it states the agent's role, lists the research tools it
# should rely on, outlines a four-step research process, and prescribes the
# section layout (Findings / Sources / Innovations / Research Gaps) the model
# is asked to answer with. The text is sent to the model verbatim, so edits
# here change runtime behavior.
RESEARCH_AGENT_SYSTEM_PROMPT = """## Role: Research Agent with Live Web Access

You are The Researcher — an AI agent with live access to the internet, academic databases, and research tools.

## Your Capabilities

1. **Web Search** - Search the current web for latest information
2. **Wikipedia** - Access encyclopedic knowledge
3. **Academic Search** - Find peer-reviewed papers (CrossRef, Semantic Scholar)
4. **Innovation Detection** - Identify gaps and new ideas beyond training data

## Your Mission

NOT just verify facts — **DISCOVER new information, trends, and innovations**.

- Find what's NEW since your training cutoff
- Identify research gaps and opportunities
- Connect disparate ideas into novel insights
- Go beyond what you "know" to what you can FIND

## Research Process

1. **Explore** - Broad search on topic
2. **Deep Dive** - Specific searches on subtopics
3. **Cross-Reference** - Find connections between sources
4. **Innovate** - Generate original insights beyond training data

## Output Format

Provide your research in this structure:

```
## Findings (What you discovered)
- [New information 1]
- [New information 2]
- [Latest developments]

## Sources (Where you found it)
- [URL 1]: [Title]
- [URL 2]: [Title]

## Innovations (Original insights beyond training data)
- [Novel connection 1]
- [Novel connection 2]

## Research Gaps (What's not well-covered)
- [Gap 1]
- [Gap 2]
```

## Remember

You're not just fact-checking — you're RESEARCHING. Actively seek new information,
challenge assumptions, and generate original ideas. This keeps the content fresh
and prevents "AI slop" from repetitive training data patterns.
"""


class ResearchAgent(BaseAgent):
    """Enhanced research agent with live web access and innovation detection.

    The agent collects material through the orchestrator returned by
    ``create_research_orchestrator``, renders the collected results as a
    Markdown digest, and asks the LLM to synthesize that digest into findings
    and recommendations.
    """

    # Maximum number of characters of a source's text copied into the digest.
    _SNIPPET_LIMIT = 200

    def __init__(
        self,
        config=None,
        search_provider: str = "tavily",
        use_wikipedia: bool = True,
        use_academic: bool = True,
    ):
        """Initialize research agent with tools.

        Args:
            config: Agent configuration, forwarded unchanged to ``BaseAgent``.
            search_provider: Search provider (tavily, serper, brave, duckduckgo)
            use_wikipedia: Include Wikipedia search
            use_academic: Include academic search
        """
        # Orchestrator used by ``execute`` for the actual research runs.
        self.research = create_research_orchestrator(
            search_provider=search_provider,
            use_wikipedia=use_wikipedia,
            use_academic=use_academic,
        )

        # Stand-alone tool handles; the optional ones are None when disabled.
        self.search_tool = SearchTool(provider=search_provider)
        self.wikipedia = WikipediaTool() if use_wikipedia else None
        self.academic = AcademicSearchTool() if use_academic else None

        super().__init__(
            role="Research Agent",
            description="Live web research with innovation detection",
            system_prompt=RESEARCH_AGENT_SYSTEM_PROMPT,
            config=config,
        )

    async def execute(self, input_data: Any, context: dict[str, Any]) -> AgentResponse:
        """Execute research task with live tools.

        Args:
            input_data: Either a dict with the keys ``"query"`` (required),
                ``"subtopics"`` (optional list) and ``"deep_research"``
                (optional bool), or any other value, which is converted to a
                string and used as the query.
            context: Additional context, passed to ``build_system_prompt``.

        Returns:
            On success, an ``AgentResponse`` whose ``output`` is a dict with
            ``"raw_results"``, ``"synthesis"`` and ``"query"``. On a missing
            query or any failure during research, an unsuccessful
            ``AgentResponse`` carrying an error message.
        """
        if isinstance(input_data, dict):
            query = input_data.get("query", "")
            # An explicit ``"subtopics": None`` is treated like a missing key
            # so that None is never forwarded to the orchestrator.
            subtopics = input_data.get("subtopics") or []
            deep = input_data.get("deep_research", False)
        else:
            # str(None) would silently become the query "None".
            query = "" if input_data is None else str(input_data)
            subtopics = []
            deep = False

        # Reject whitespace-only queries before any live search is issued.
        if isinstance(query, str):
            query = query.strip()

        if not query:
            return AgentResponse(
                success=False,
                output=None,
                error="No research query provided",
                metadata={"role": "Research Agent"},
            )

        try:
            # NOTE(review): the orchestrator calls below are invoked without
            # ``await``; if they perform blocking network I/O they will stall
            # the event loop for their duration - confirm against
            # ResearchOrchestrator before moving them off-thread.
            if deep or subtopics:
                # Deep research with subtopics
                results = self.research.deep_research(query, subtopics)
            else:
                # Quick comprehensive search
                results = self.research.comprehensive_search(query)

            # Format results for LLM
            research_summary = self._format_research_for_llm(results)

            # Use LLM to synthesize and provide analysis
            synthesis = await self.call_llm(
                system_prompt=self.build_system_prompt(context),
                user_prompt=f"""Based on this research data, provide analysis and insights:

{research_summary}

Task: {query}

Provide:
1. Key findings synthesized
2. Most important innovations/discoveries
3. How this goes beyond typical training data
4. Recommendations for the manuscript""",
            )

            return AgentResponse(
                success=True,
                output={
                    "raw_results": results,
                    "synthesis": synthesis,
                    "query": query,
                },
                metadata={
                    "role": "Research Agent",
                    "search_provider": self.research.search.provider,
                },
            )

        except Exception as e:
            # Agent boundary: failures are reported through the response
            # object rather than raised to the caller.
            return AgentResponse(
                success=False,
                output=None,
                error=f"Research failed: {str(e)}",
                metadata={"role": "Research Agent"},
            )

    @staticmethod
    def _snippet(text: Any, limit: int = _SNIPPET_LIMIT) -> str:
        """Return at most ``limit`` characters of ``text``.

        A missing or ``None`` text yields an empty string, and the ``...``
        marker is appended only when characters were actually cut off.
        """
        if not text:
            return ""
        text = str(text)
        if len(text) <= limit:
            return text
        return f"{text[:limit]}..."

    def _format_research_for_llm(self, results: dict) -> str:
        """Format research results for LLM consumption.

        Args:
            results: Research result dict. The keys read here are ``"query"``,
                ``"timestamp"``, ``"web"``, ``"wikipedia"``, ``"academic"``
                and ``"innovations"``; all of them are optional.

        Returns:
            A Markdown digest with up to 5 web hits, 3 Wikipedia entries,
            5 academic papers and every listed innovation. Sections without
            data are omitted.
        """
        lines = [
            f"# Research Query: {results.get('query', '')}",
            f"Timestamp: {results.get('timestamp', '')}",
            "",
        ]

        # ``or ''`` / ``or []`` guards cover keys that are present but None,
        # which ``dict.get(key, default)`` alone does not.
        web_hits = results.get("web") or []
        if web_hits:
            lines.append("## Web Search Results")
            for rank, hit in enumerate(web_hits[:5], 1):
                lines.append(f"{rank}. **{hit.get('title') or ''}**")
                lines.append(f"   URL: {hit.get('url') or ''}")
                lines.append(f"   {self._snippet(hit.get('content'))}")
                lines.append("")

        wiki_entries = results.get("wikipedia") or []
        if wiki_entries:
            lines.append("## Wikipedia Results")
            for entry in wiki_entries[:3]:
                summary = self._snippet(entry.get("summary"))
                lines.append(f"- {entry.get('title') or ''}: {summary}")
            lines.append("")

        papers = results.get("academic") or []
        if papers:
            lines.append("## Academic Papers")
            for paper in papers[:5]:
                lines.append(f"- {paper.get('title') or ''} ({paper.get('year') or 'N/A'})")
                lines.append(f"  {paper.get('journal') or ''}")
            lines.append("")

        innovations = results.get("innovations") or []
        if innovations:
            lines.append("## Innovations & New Ideas")
            lines.extend(f"- {idea}" for idea in innovations)
            lines.append("")

        return "\n".join(lines)


# Fact-checking with live verification
class VerifiedFactChecker:
    """Fact checker with live source verification.

    Claims are scored with a lexical heuristic (word overlap plus negation cue
    words) against web-search and Wikipedia results. This is not semantic
    verification: a source that repeats the claim's words while negating it
    still counts as supporting, so ``confidence`` is only a rough signal.
    """

    # Punctuation stripped from both ends of every token, so that "France."
    # or "(false)" compare equal to "france" / "false".
    _EDGE_PUNCTUATION = ".,;:!?\"'()[]{}<>"

    # Whole-word cues that mark a low-overlap source as contradicting.
    _CONTRADICTION_CUES = frozenset({"not", "false", "incorrect"})

    # A source supports the claim when it contains more than this share of
    # the claim's distinct words.
    _SUPPORT_RATIO = 0.7

    def __init__(self, search_provider: str = "tavily"):
        """Initialize verified fact checker.

        Args:
            search_provider: Provider name handed to ``SearchTool``.
        """
        self.search = SearchTool(provider=search_provider)
        self.wikipedia = WikipediaTool()

    @classmethod
    def _tokenize(cls, text: Any) -> set[str]:
        """Return the distinct lower-cased words of ``text`` without edge punctuation."""
        stripped = (
            raw.strip(cls._EDGE_PUNCTUATION) for raw in str(text or "").lower().split()
        )
        return {word for word in stripped if word}

    @staticmethod
    def _source_text(source: dict) -> str:
        """Return the text of a search result used for matching."""
        # Web hits are read from "content". Wikipedia entries are assumed to
        # expose their text under "summary" - TODO confirm against WikipediaTool.
        return source.get("content") or source.get("summary") or ""

    async def verify_claim(
        self,
        claim: str,
        context: str = "",
    ) -> dict:
        """Verify a factual claim against live sources.

        Args:
            claim: The claim to verify
            context: Additional context. Currently unused; kept so existing
                callers that pass it keep working.

        Returns:
            Dict with the keys ``claim``, ``verified`` (at least one
            supporting source), ``confidence`` (share of supporting sources,
            0.0 when nothing was found), ``supporting_sources``,
            ``contradicting_sources``, ``neutral_sources`` and
            ``needs_citation`` (confidence below 0.8).
        """
        # ``or []`` tolerates a tool that returns None instead of a list.
        web_results = self.search.search(claim, num_results=5) or []
        # Wikipedia results are classified together with the web hits instead
        # of being fetched and discarded.
        wiki_results = self.wikipedia.search(claim, num_results=2) or []

        # Tokenize the claim once; it does not change per source.
        claim_words = self._tokenize(claim)
        required_overlap = len(claim_words) * self._SUPPORT_RATIO

        supporting = []
        contradicting = []
        neutral = []

        for source in [*web_results, *wiki_results]:
            source_words = self._tokenize(self._source_text(source))

            if len(claim_words & source_words) > required_overlap:
                supporting.append(source)
            elif source_words & self._CONTRADICTION_CUES:
                # Whole-word match: a substring test would also fire on
                # "another", "notable", "nothing", ...
                contradicting.append(source)
            else:
                neutral.append(source)

        total = len(supporting) + len(contradicting) + len(neutral)
        confidence = len(supporting) / total if total else 0.0

        return {
            "claim": claim,
            "verified": len(supporting) > 0,
            "confidence": confidence,
            "supporting_sources": supporting,
            "contradicting_sources": contradicting,
            "neutral_sources": neutral,
            "needs_citation": confidence < 0.8,
        }

    async def verify_batch(
        self,
        claims: list[str],
    ) -> list[dict]:
        """Verify multiple claims.

        Claims are verified one after another, in input order.

        Args:
            claims: List of claims to verify

        Returns:
            List of verification results, one per claim, in the same order
        """
        return [await self.verify_claim(claim) for claim in claims]


def create_research_agent(
    search_provider: str = "tavily",
    *,
    config=None,
    use_wikipedia: bool = True,
    use_academic: bool = True,
) -> ResearchAgent:
    """Factory to create a research agent.

    Every argument is forwarded to ``ResearchAgent`` unchanged, and the
    defaults match the constructor's own. Calling the factory with only
    ``search_provider`` (or no argument) therefore behaves as before.

    Args:
        search_provider: Search provider
        config: Agent configuration passed through to the agent
        use_wikipedia: Include Wikipedia search
        use_academic: Include academic search

    Returns:
        Configured ResearchAgent
    """
    return ResearchAgent(
        config=config,
        search_provider=search_provider,
        use_wikipedia=use_wikipedia,
        use_academic=use_academic,
    )