Add live research capabilities with innovation detection

Research Tools: SearchTool (multiple backends: Tavily, Serper, Brave, DuckDuckGo); WikipediaTool (Wikipedia lookup); AcademicSearchTool (CrossRef, Semantic Scholar); ResearchOrchestrator (comprehensive multi-source research). ResearchAgent: NOT just fact-checking - actively discovers NEW information; identifies trends beyond the training data cutoff; generates innovations by cross-referencing sources; supports deep research with subtopics. VerifiedFactChecker: live claim verification against web sources; confidence scoring; citation-needed detection. Dependencies added: tavily, wikipedia, arxiv, duckduckgo-search.
2026-03-13 05:03:52 +00:00
parent 6766e93c3d
commit 8cb29889cc
5 changed files with 862 additions and 0 deletions
@@ -22,6 +22,11 @@ from opus_orchestrator.agents.nonfiction import (
    NonfictionWriterAgent,
    ResearcherAgent,
 )
 from opus_orchestrator.agents.research import (
    ResearchAgent,
    VerifiedFactChecker,
    create_research_agent,
 )
 from opus_orchestrator.config import OpusConfig, get_config
 from opus_orchestrator.schemas import (
    BookIntent,
@@ -83,6 +88,10 @@ __all__ = [
    "NonfictionWriterAgent",
    "FactCheckerAgent",
    "NonfictionEditorAgent",
    # Research Agent (NEW!)
    "ResearchAgent",
    "VerifiedFactChecker",
    "create_research_agent",
    # LangGraph
    "OpusGraph",
    "OpusGraphState",
@@ -0,0 +1,339 @@
 """Research Agent for Opus Orchestrator.
 Enhanced nonfiction agent with live research capabilities.
 """
 import os
 from typing import Any, Optional
 from dotenv import load_dotenv
 load_dotenv()
 from opus_orchestrator.agents.base import BaseAgent, AgentResponse
 from opus_orchestrator.utils.research import (
    ResearchOrchestrator,
    create_research_orchestrator,
    SearchTool,
    WikipediaTool,
    AcademicSearchTool,
 )
 # System prompt for the research agent.
 # ResearchAgent.__init__ passes this string to BaseAgent as `system_prompt`.
 # It describes the agent's role, its tools, the research process and the
 # Markdown structure (Findings / Sources / Innovations / Research Gaps) the
 # model is asked to answer in. The text is runtime data: edit with care.
 RESEARCH_AGENT_SYSTEM_PROMPT = """## Role: Research Agent with Live Web Access
 You are The Researcher — an AI agent with live access to the internet, academic databases, and research tools.
 ## Your Capabilities
 1. **Web Search** - Search the current web for latest information
 2. **Wikipedia** - Access encyclopedic knowledge
 3. **Academic Search** - Find peer-reviewed papers (CrossRef, Semantic Scholar)
 4. **Innovation Detection** - Identify gaps and new ideas beyond training data
 ## Your Mission
 NOT just verify facts — **DISCOVER new information, trends, and innovations**.
 - Find what's NEW since your training cutoff
 - Identify research gaps and opportunities  
 - Connect disparate ideas into novel insights
 - Go beyond what you "know" to what you can FIND
 ## Research Process
 1. **Explore** - Broad search on topic
 2. **Deep Dive** - Specific searches on subtopics
 3. **Cross-Reference** - Find connections between sources
 4. **Innovate** - Generate original insights beyond training data
 ## Output Format
 Provide your research in this structure:
 ```
 ## Findings (What you discovered)
 - [New information 1]
 - [New information 2]
 - [Latest developments]
 ## Sources (Where you found it)
 - [URL 1]: [Title]
 - [URL 2]: [Title]
 ## Innovations (Original insights beyond training data)
 - [Novel connection 1]
 - [Novel connection 2]
 ## Research Gaps (What's not well-covered)
 - [Gap 1]
 - [Gap 2]
 ```
 ## Remember
 You're not just fact-checking — you're RESEARCHING. Actively seek new information, 
 challenge assumptions, and generate original ideas. This keeps the content fresh 
 and prevents "AI slop" from repetitive training data patterns.
 """
 class ResearchAgent(BaseAgent):
    """Research agent that gathers live search results and lets the LLM synthesize them.

    ``execute`` runs either a single comprehensive search or a deep
    (topic + subtopics) search through a ``ResearchOrchestrator``, renders the
    raw results as Markdown-like text and hands that text to the LLM.
    """

    def __init__(
        self,
        config=None,
        search_provider: str = "tavily",
        use_wikipedia: bool = True,
        use_academic: bool = True,
    ):
        """Initialize research agent with tools.

        Args:
            config: Agent configuration, forwarded to ``BaseAgent``.
            search_provider: Search provider (tavily, serper, brave, duckduckgo)
            use_wikipedia: Include Wikipedia search
            use_academic: Include academic search
        """
        # The orchestrator is what ``execute`` actually uses.
        self.research = create_research_orchestrator(
            search_provider=search_provider,
            use_wikipedia=use_wikipedia,
            use_academic=use_academic,
        )
        # Stand-alone tool handles, kept as public attributes for direct use.
        self.search_tool = SearchTool(provider=search_provider)
        self.wikipedia = WikipediaTool() if use_wikipedia else None
        self.academic = AcademicSearchTool() if use_academic else None
        super().__init__(
            role="Research Agent",
            description="Live web research with innovation detection",
            system_prompt=RESEARCH_AGENT_SYSTEM_PROMPT,
            config=config,
        )

    async def execute(self, input_data: Any, context: dict[str, Any]) -> AgentResponse:
        """Execute research task with live tools.

        Args:
            input_data: Either a dict with the keys ``query`` (str),
                ``subtopics`` (list, optional) and ``deep_research`` (bool,
                optional), or any other object, which is converted with
                ``str()`` and used as the query.
            context: Additional context, passed to ``build_system_prompt``.

        Returns:
            ``AgentResponse`` whose ``output`` holds ``raw_results``,
            ``synthesis`` and ``query`` on success. On an empty query or any
            exception the response has ``success=False`` and an ``error``
            message instead; exceptions are not propagated.
        """
        if isinstance(input_data, dict):
            query = input_data.get("query", "")
            subtopics = input_data.get("subtopics", [])
            deep = input_data.get("deep_research", False)
        else:
            query = str(input_data)
            subtopics = []
            deep = False
        if not query:
            return AgentResponse(
                success=False,
                output=None,
                error="No research query provided",
                metadata={"role": "Research Agent"},
            )
        try:
            # NOTE: the orchestrator calls are synchronous, so they run to
            # completion before this coroutine yields again.
            if deep or subtopics:
                # Deep research returns a nested structure (main topic plus
                # one result set per subtopic).
                results = self.research.deep_research(query, subtopics)
            else:
                # Quick comprehensive search returns one flat result set.
                results = self.research.comprehensive_search(query)
            # Both result shapes are handled by the formatter.
            research_summary = self._format_research_for_llm(results)
            synthesis = await self.call_llm(
                system_prompt=self.build_system_prompt(context),
                user_prompt=f"""Based on this research data, provide analysis and insights:
 {research_summary}
 Task: {query}
 Provide:
 1. Key findings synthesized
 2. Most important innovations/discoveries
 3. How this goes beyond typical training data
 4. Recommendations for the manuscript""",
            )
            return AgentResponse(
                success=True,
                output={
                    "raw_results": results,
                    "synthesis": synthesis,
                    "query": query,
                },
                metadata={
                    "role": "Research Agent",
                    "search_provider": self.research.search.provider,
                },
            )
        except Exception as e:
            # Boundary handler: report the failure to the caller as data.
            return AgentResponse(
                success=False,
                output=None,
                error=f"Research failed: {str(e)}",
                metadata={"role": "Research Agent"},
            )

    def _format_research_for_llm(self, results: dict) -> str:
        """Format research results for LLM consumption.

        Accepts both result shapes produced by the orchestrator:

        * the flat dict from ``comprehensive_search`` (``query``, ``web``,
          ``wikipedia``, ``academic``, ``innovations``), and
        * the nested dict from ``deep_research`` (``main_topic``,
          ``main_research``, ``subtopic_research``, ``cross_references``).

        Previously only the flat shape was read, so deep research produced an
        empty summary.

        Args:
            results: Result dict from either orchestrator method.

        Returns:
            Newline-joined text summary.
        """
        if "main_research" not in results:
            return "\n".join(self._format_search_section(results))

        output = [f"# Deep Research Topic: {results.get('main_topic', '')}", ""]
        output.extend(self._format_search_section(results.get("main_research") or {}))
        subtopic_research = results.get("subtopic_research") or {}
        for subtopic, sub_results in subtopic_research.items():
            output.append(f"# Subtopic: {subtopic}")
            output.extend(self._format_search_section(sub_results or {}))
        cross_references = results.get("cross_references") or []
        if cross_references:
            output.append("## Cross-References")
            output.extend(f"- {ref}" for ref in cross_references)
            output.append("")
        return "\n".join(output)

    def _format_search_section(self, results: dict) -> list[str]:
        """Render one flat ``comprehensive_search`` result as a list of text lines."""
        output = []
        # Query
        output.append(f"# Research Query: {results.get('query', '')}")
        output.append(f"Timestamp: {results.get('timestamp', '')}")
        output.append("")
        # Web results (top 5, snippets truncated to 200 characters)
        web = results.get("web", [])
        if web:
            output.append("## Web Search Results")
            for rank, hit in enumerate(web[:5], 1):
                output.append(f"{rank}. **{hit.get('title', '')}**")
                output.append(f"   URL: {hit.get('url', '')}")
                # ``or ''`` guards against an explicit None value.
                output.append(f"   {(hit.get('content') or '')[:200]}...")
                output.append("")
        # Wikipedia (top 3)
        wiki = results.get("wikipedia", [])
        if wiki:
            output.append("## Wikipedia Results")
            for article in wiki[:3]:
                output.append(
                    f"- {article.get('title', '')}: {(article.get('summary') or '')[:200]}..."
                )
            output.append("")
        # Academic (top 5)
        academic = results.get("academic", [])
        if academic:
            output.append("## Academic Papers")
            for paper in academic[:5]:
                output.append(f"- {paper.get('title', '')} ({paper.get('year', 'N/A')})")
                output.append(f"  {paper.get('journal', '')}")
            output.append("")
        # Innovations
        innovations = results.get("innovations", [])
        if innovations:
            output.append("## Innovations & New Ideas")
            for idea in innovations:
                output.append(f"- {idea}")
            output.append("")
        return output
 # Fact-checking with live verification
 class VerifiedFactChecker:
    """Fact checker with live source verification.

    Classification is a keyword heuristic, not semantic analysis: a source
    "supports" a claim when it contains more than 70% of the claim's words,
    and "contradicts" it when it contains a negation word instead.
    """

    # Whole words that mark a source as contradicting. Matching whole words
    # avoids false hits such as "not" inside "note" or "another".
    _NEGATION_WORDS = frozenset(
        {"not", "cannot", "false", "falsely", "incorrect", "incorrectly"}
    )
    # Punctuation stripped from both ends of each token before comparison.
    _PUNCTUATION = ".,;:!?\"'()[]{}"

    def __init__(self, search_provider: str = "tavily"):
        """Initialize verified fact checker.

        Args:
            search_provider: Provider name passed to ``SearchTool``.
        """
        self.search = SearchTool(provider=search_provider)
        self.wikipedia = WikipediaTool()

    @classmethod
    def _tokenize(cls, text) -> set:
        """Return the set of lower-cased words in *text* (``None`` gives an empty set)."""
        stripped = (word.strip(cls._PUNCTUATION) for word in (text or "").lower().split())
        return {word for word in stripped if word}

    async def verify_claim(
        self,
        claim: str,
        context: str = "",
    ) -> dict:
        """Verify a factual claim against live sources.

        Args:
            claim: The claim to verify
            context: Additional context (currently not used by the heuristic)

        Returns:
            Dict with ``claim``, ``verified`` (at least one supporting
            source), ``confidence`` (supporting / all sources, 0.0 when there
            are none), the three source lists and ``needs_citation``
            (confidence below 0.8).
        """
        # Web hits and Wikipedia articles are evaluated together; the
        # Wikipedia results used to be fetched and then discarded.
        sources = list(self.search.search(claim, num_results=5) or [])
        sources.extend(self.wikipedia.search(claim, num_results=2) or [])

        claim_words = self._tokenize(claim)
        supporting = []
        contradicting = []
        neutral = []
        for source in sources:
            content_words = self._tokenize(source.get("content"))
            overlap = claim_words & content_words
            if len(overlap) > len(claim_words) * 0.7:
                supporting.append(source)
            elif content_words & self._NEGATION_WORDS:
                contradicting.append(source)
            else:
                neutral.append(source)

        total = len(sources)
        confidence = len(supporting) / total if total else 0.0
        return {
            "claim": claim,
            "verified": len(supporting) > 0,
            "confidence": confidence,
            "supporting_sources": supporting,
            "contradicting_sources": contradicting,
            "neutral_sources": neutral,
            "needs_citation": confidence < 0.8,
        }

    async def verify_batch(
        self,
        claims: list[str],
    ) -> list[dict]:
        """Verify multiple claims, one after another.

        Args:
            claims: List of claims to verify

        Returns:
            List of verification results, in the order of *claims*
        """
        return [await self.verify_claim(claim) for claim in claims]
 def create_research_agent(
    search_provider: str = "tavily",
 ) -> ResearchAgent:
    """Build a ``ResearchAgent`` for the given search backend.

    Args:
        search_provider: Name of the search provider handed to the agent.

    Returns:
        A ``ResearchAgent`` constructed with that provider and otherwise
        default settings.
    """
    agent = ResearchAgent(search_provider=search_provider)
    return agent
@@ -5,6 +5,13 @@ from opus_orchestrator.utils.github_ingest import GitHubIngestor, create_github_
 from opus_orchestrator.utils.s3_ingest import S3Ingestor, create_s3_ingestor
 from opus_orchestrator.utils.local_ingest import LocalIngestor, create_local_ingestor
 from opus_orchestrator.utils.llm import get_llm_client
 from opus_orchestrator.utils.research import (
    ResearchOrchestrator,
    SearchTool,
    WikipediaTool,
    AcademicSearchTool,
    create_research_orchestrator,
 )
 __all__ = [
    "generate_docs",
@@ -15,4 +22,10 @@ __all__ = [
    "LocalIngestor",
    "create_local_ingestor",
    "get_llm_client",
    # Research (NEW!)
    "ResearchOrchestrator",
    "SearchTool",
    "WikipediaTool",
    "AcademicSearchTool",
    "create_research_orchestrator",
 ]
@@ -0,0 +1,496 @@
 """Research tools for Opus Orchestrator.
 Provides web search, database lookup, and research capabilities.
 """
 import os
 import json
 from typing import Any, Optional, Callable
 from datetime import datetime
 import requests
 from dotenv import load_dotenv
 load_dotenv()
 class SearchTool:
    """Web search tool using multiple backends.

    Every backend returns the same result shape: a list of dicts with
    ``title``, ``url``, ``content`` and ``score``. Backend failures are
    printed and reported as an empty list rather than raised.
    """

    # Environment variable holding the API key for each keyed provider.
    _API_KEY_ENV_VARS = {
        "tavily": "TAVILY_API_KEY",
        "serper": "SERPER_API_KEY",
        "brave": "BRAVE_API_KEY",
    }

    def __init__(self, provider: str = "tavily"):
        """Initialize search tool.

        Args:
            provider: Search provider (tavily, serper, brave, duckduckgo).
                Any other value is served by the DuckDuckGo backend.
        """
        self.provider = provider
        self._setup_provider()

    def _setup_provider(self):
        """Load the provider's API key from the environment.

        ``self.api_key`` is always defined afterwards; it is ``None`` for
        providers that need no key (or whose variable is unset), so reading
        it never raises ``AttributeError``.
        """
        env_var = self._API_KEY_ENV_VARS.get(self.provider)
        self.api_key = os.environ.get(env_var) if env_var else None

    def search(
        self,
        query: str,
        num_results: int = 10,
    ) -> list[dict]:
        """Search the web.

        Args:
            query: Search query
            num_results: Number of results to return

        Returns:
            List of search results with title, url, content and score
        """
        if self.provider == "tavily":
            return self._search_tavily(query, num_results)
        elif self.provider == "serper":
            return self._search_serper(query, num_results)
        elif self.provider == "brave":
            return self._search_brave(query, num_results)
        else:
            return self._search_duckduckgo(query, num_results)

    def _search_tavily(self, query: str, num_results: int) -> list[dict]:
        """Search using Tavily."""
        try:
            # Imported lazily so the dependency is only needed when used.
            from tavily import TavilyClient
            client = TavilyClient(api_key=self.api_key)
            results = client.search(query=query, max_results=num_results)
            return [
                {
                    "title": r.get("title", ""),
                    "url": r.get("url", ""),
                    "content": r.get("content", ""),
                    "score": r.get("score", 0),
                }
                for r in results.get("results", [])
            ]
        except Exception as e:
            print(f"Tavily search error: {e}")
            return []

    def _search_serper(self, query: str, num_results: int) -> list[dict]:
        """Search using Serper."""
        try:
            headers = {
                "X-API-KEY": self.api_key,
                "Content-Type": "application/json",
            }
            payload = {"q": query, "num": num_results}
            response = requests.post(
                "https://google.serper.dev/search",
                headers=headers,
                json=payload,
                timeout=10,
            )
            # Surface HTTP errors (bad key, rate limit) instead of parsing
            # the error body as if it were a result page.
            response.raise_for_status()
            data = response.json()
            return [
                {
                    "title": r.get("title", ""),
                    "url": r.get("link", ""),
                    "content": r.get("snippet", ""),
                    "score": 1.0,
                }
                for r in data.get("organic", [])
            ]
        except Exception as e:
            print(f"Serper search error: {e}")
            return []

    def _search_brave(self, query: str, num_results: int) -> list[dict]:
        """Search using Brave."""
        try:
            headers = {"Accept": "application/json", "X-Subscription-Token": self.api_key}
            response = requests.get(
                "https://api.search.brave.com/res/v1/web/search",
                params={"q": query, "count": num_results},
                headers=headers,
                timeout=10,
            )
            # Same reasoning as for Serper: report HTTP failures explicitly.
            response.raise_for_status()
            data = response.json()
            return [
                {
                    "title": r.get("title", ""),
                    "url": r.get("url", ""),
                    "content": r.get("description", ""),
                    "score": r.get("score", 0),
                }
                for r in data.get("web", {}).get("results", [])
            ]
        except Exception as e:
            print(f"Brave search error: {e}")
            return []

    def _search_duckduckgo(self, query: str, num_results: int) -> list[dict]:
        """Search using DuckDuckGo (no API key needed)."""
        try:
            # Imported lazily so the dependency is only needed when used.
            from duckduckgo_search import DDGS
            results = DDGS().text(query, max_results=num_results)
            return [
                {
                    "title": r.get("title", ""),
                    "url": r.get("href", ""),
                    "content": r.get("body", ""),
                    "score": 1.0,
                }
                for r in results
            ]
        except Exception as e:
            print(f"DuckDuckGo search error: {e}")
            return []
 class WikipediaTool:
    """Wikipedia lookup tool built on the ``wikipedia`` package."""

    def __init__(self):
        """Initialize Wikipedia tool (no configuration required)."""

    def search(self, query: str, num_results: int = 5) -> list[dict]:
        """Search Wikipedia.

        Args:
            query: Search query
            num_results: Number of results

        Returns:
            List of article dicts with ``title``, ``url``, ``summary`` (first
            500 characters) and ``content`` (first 2000 characters). Titles
            whose page cannot be loaded are skipped; if the search itself
            fails the error is printed and an empty list is returned.
        """
        try:
            # Imported lazily so the dependency is only needed when used.
            import wikipedia
            results = wikipedia.search(query, results=num_results)
            articles = []
            for title in results:
                try:
                    # Titles come straight from the search API, so they are
                    # already exact; auto-suggest could rewrite them and
                    # resolve to a different page.
                    page = wikipedia.page(title, auto_suggest=False)
                    articles.append({
                        "title": page.title,
                        "url": page.url,
                        "summary": page.summary[:500],
                        "content": page.content[:2000],
                    })
                except Exception:
                    # Best effort: skip pages that fail to load (for example
                    # disambiguation or missing pages). ``Exception`` rather
                    # than a bare ``except`` so Ctrl-C and interpreter
                    # shutdown are not swallowed.
                    continue
            return articles
        except Exception as e:
            print(f"Wikipedia search error: {e}")
            return []

    def get_article(self, title: str) -> dict:
        """Get a Wikipedia article by title.

        Args:
            title: Article title

        Returns:
            Dict with ``title``, ``url``, ``summary``, ``content`` (first 5000
            characters) and up to 10 ``references``; on any failure a dict
            with the single key ``error``.
        """
        try:
            import wikipedia
            page = wikipedia.page(title)
            return {
                "title": page.title,
                "url": page.url,
                "summary": page.summary,
                "content": page.content[:5000],
                "references": page.references[:10] if hasattr(page, "references") else [],
            }
        except Exception as e:
            return {"error": str(e)}
 class ArxivTool:
    """ArXiv paper search tool built on the ``arxiv`` package."""

    def __init__(self):
        """Initialize ArXiv tool (no configuration required)."""

    @staticmethod
    def _build_query(query: str, categories: Optional[list[str]] = None) -> str:
        """Combine a free-text query with an optional arXiv category filter.

        Categories are expressed with the arXiv query-language ``cat:`` field
        prefix and OR-ed together, then AND-ed with the text query.
        """
        if not categories:
            return query
        category_filter = " OR ".join(f"cat:{category}" for category in categories)
        if not query:
            return category_filter
        return f"({query}) AND ({category_filter})"

    def search(
        self,
        query: str,
        max_results: int = 10,
        categories: Optional[list[str]] = None,
    ) -> list[dict]:
        """Search ArXiv for papers.

        Args:
            query: Search query
            max_results: Max results
            categories: ArXiv categories to filter by (e.g. ``["cs.AI"]``)

        Returns:
            List of paper dicts with ``title``, ``url``, ``abstract`` (first
            1000 characters), ``authors``, ``published`` and ``categories``.
            On any failure the error is printed and an empty list returned.
        """
        try:
            # Imported lazily so the dependency is only needed when used.
            import arxiv
            client = arxiv.Client()
            # ``arxiv.Search`` has no ``categories`` argument; passing one
            # raised TypeError on every call, which the handler below turned
            # into an empty result. The filter belongs in the query string.
            search = arxiv.Search(
                query=self._build_query(query, categories),
                max_results=max_results,
            )
            return [
                {
                    "title": result.title,
                    "url": result.entry_id,
                    "abstract": result.summary[:1000],
                    "authors": [a.name for a in result.authors],
                    "published": str(result.published.date()),
                    "categories": result.categories,
                }
                for result in client.results(search)
            ]
        except Exception as e:
            print(f"ArXiv search error: {e}")
            return []
 class AcademicSearchTool:
    """Academic paper search (CrossRef, Semantic Scholar).

    Both search methods print the error and return an empty list on failure.
    Record parsing tolerates missing, empty and ``null`` fields so that one
    incomplete record no longer discards the whole result set.
    """

    def __init__(self):
        """Initialize academic search tool (no configuration required)."""

    @staticmethod
    def _first(values, default=""):
        """Return the first element of a list-valued field, or *default* if missing or empty."""
        return values[0] if values else default

    @staticmethod
    def _parse_crossref_item(item: dict) -> dict:
        """Convert one CrossRef ``items`` entry into the tool's paper dict."""
        date_parts = (item.get("created") or {}).get("date-parts") or [[None]]
        first_date = date_parts[0] or [None]
        return {
            "title": AcademicSearchTool._first(item.get("title")),
            "url": item.get("URL", ""),
            # Fall back to ``name`` when an author entry has no given/family name.
            "authors": [
                f"{a.get('given', '')} {a.get('family', '')}".strip() or a.get("name", "")
                for a in item.get("author") or []
            ],
            "year": first_date[0],
            "journal": AcademicSearchTool._first(item.get("container-title")),
            "doi": item.get("DOI", ""),
        }

    @staticmethod
    def _parse_semantic_scholar_paper(paper: dict) -> dict:
        """Convert one Semantic Scholar ``data`` entry into the tool's paper dict."""
        return {
            "title": paper.get("title") or "",
            "url": paper.get("url") or "",
            # ``abstract`` can be null in the response; slicing None raised
            # TypeError and aborted the whole search.
            "abstract": (paper.get("abstract") or "")[:500],
            "authors": [a.get("name", "") for a in (paper.get("authors") or [])[:5]],
            "year": paper.get("year"),
            "citations": paper.get("citationCount") or 0,
        }

    def search_crossref(self, query: str, max_results: int = 10) -> list[dict]:
        """Search CrossRef for academic papers.

        Args:
            query: Search query
            max_results: Maximum number of rows requested

        Returns:
            List of dicts with ``title``, ``url``, ``authors``, ``year``,
            ``journal`` and ``doi``; empty list on failure.
        """
        try:
            url = "https://api.crossref.org/works"
            params = {"query": query, "rows": max_results}
            response = requests.get(url, params=params, timeout=10)
            # Report HTTP failures instead of parsing an error body.
            response.raise_for_status()
            data = response.json()
            items = (data.get("message") or {}).get("items") or []
            return [self._parse_crossref_item(item) for item in items]
        except Exception as e:
            print(f"CrossRef search error: {e}")
            return []

    def search_semantic_scholar(self, query: str, max_results: int = 10) -> list[dict]:
        """Search Semantic Scholar for papers.

        Args:
            query: Search query
            max_results: Maximum number of papers requested

        Returns:
            List of dicts with ``title``, ``url``, ``abstract`` (first 500
            characters), ``authors`` (first 5), ``year`` and ``citations``;
            empty list on failure.
        """
        try:
            url = "https://api.semanticscholar.org/graph/v1/paper/search"
            params = {
                "query": query,
                "limit": max_results,
                "fields": "title,url,abstract,authors,year,citationCount",
            }
            response = requests.get(url, params=params, timeout=10)
            # Report HTTP failures (e.g. rate limiting) explicitly.
            response.raise_for_status()
            data = response.json()
            return [self._parse_semantic_scholar_paper(p) for p in data.get("data") or []]
        except Exception as e:
            print(f"Semantic Scholar search error: {e}")
            return []
 class ResearchOrchestrator:
    """Fan a query out to the configured research tools and merge the answers."""

    def __init__(
        self,
        search_provider: str = "tavily",
        use_wikipedia: bool = True,
        use_academic: bool = True,
    ):
        """Build the tool set used by the orchestrator.

        Args:
            search_provider: Provider name for the web ``SearchTool``
            use_wikipedia: Create a ``WikipediaTool`` (otherwise ``None``)
            use_academic: Create an ``AcademicSearchTool`` (otherwise ``None``)
        """
        self.search = SearchTool(provider=search_provider)
        self.wikipedia = None
        if use_wikipedia:
            self.wikipedia = WikipediaTool()
        self.academic = None
        if use_academic:
            self.academic = AcademicSearchTool()

    def comprehensive_search(
        self,
        query: str,
        include_web: bool = True,
        include_wikipedia: bool = True,
        include_academic: bool = True,
    ) -> dict:
        """Query every enabled source once and bundle the results.

        Args:
            query: Research query
            include_web: Run the web search (10 results)
            include_wikipedia: Run the Wikipedia search (5 results) if the
                tool is configured
            include_academic: Run CrossRef and Semantic Scholar (5 results
                each) if the tool is configured

        Returns:
            Dict with ``query``, ``timestamp``, ``web``, ``wikipedia``,
            ``academic`` and ``innovations``
        """
        # The timestamp is taken before any source is queried.
        stamp = datetime.now().isoformat()
        web_hits = []
        wiki_hits = []
        papers = []
        if include_web:
            web_hits = self.search.search(query, num_results=10)
        if self.wikipedia and include_wikipedia:
            wiki_hits = self.wikipedia.search(query, num_results=5)
        if self.academic and include_academic:
            papers = self.academic.search_crossref(query, max_results=5)
            # In-place append of the Semantic Scholar hits after CrossRef's.
            papers += self.academic.search_semantic_scholar(query, max_results=5)
        combined = {
            "query": query,
            "timestamp": stamp,
            "web": web_hits,
            "wikipedia": wiki_hits,
            "academic": papers,
            "innovations": [],
        }
        combined["innovations"] = self._generate_innovations(combined)
        return combined

    def _generate_innovations(self, research: dict) -> list[str]:
        """Derive a few canned observations from the gathered results.

        The notes are rule-based: one when both web text and academic titles
        are present, one when fewer than three academic results exist, and
        always a closing note that carries the research timestamp.

        Args:
            research: Combined research results

        Returns:
            List of observation strings
        """
        ideas = []
        snippets = [hit.get("content", "")[:200] for hit in research.get("web", [])[:5]]
        web_text = " ".join(snippets)
        titles = [paper.get("title", "") for paper in research.get("academic", [])[:5]]
        # Both kinds of source present
        if web_text and titles:
            ideas.append(
                "Cross-disciplinary connection: Apply web trends to academic findings"
            )
        # Sparse academic coverage
        paper_count = len(research.get("academic", []))
        if paper_count < 3:
            ideas.append(
                "Research gap: Limited academic coverage - original contribution opportunity"
            )
        # Always record when the research was run
        ideas.append(
            f"Research timestamp: {research.get('timestamp')} - ensures current information"
        )
        return ideas

    def deep_research(
        self,
        topic: str,
        subtopics: list[str] = None,
    ) -> dict:
        """Research a topic, then each subtopic as ``"<topic>: <subtopic>"``.

        Args:
            topic: Main topic
            subtopics: Related subtopics to research (``None`` means none)

        Returns:
            Dict with ``main_topic``, ``main_research``,
            ``subtopic_research`` (keyed by subtopic) and ``cross_references``
        """
        main = self.comprehensive_search(topic)
        per_subtopic = {
            name: self.comprehensive_search(f"{topic}: {name}")
            for name in (subtopics or [])
        }
        report = {
            "main_topic": topic,
            "main_research": main,
            "subtopic_research": per_subtopic,
        }
        report["cross_references"] = self._cross_reference(report)
        return report

    def _cross_reference(self, deep_results: dict) -> list[str]:
        """Report subtopics whose top web snippets share more than 10 words with the main topic's."""

        def joined_snippets(result_set: dict) -> str:
            # First 300 characters of the top three web hits, space-joined.
            return " ".join(
                [hit.get("content", "")[:300] for hit in result_set.get("web", [])[:3]]
            )

        main_text = joined_snippets(deep_results.get("main_research", {}))
        connections = []
        for name, sub_results in deep_results.get("subtopic_research", {}).items():
            sub_text = joined_snippets(sub_results)
            if not (main_text and sub_text):
                continue
            shared = set(main_text.lower().split()).intersection(sub_text.lower().split())
            if len(shared) > 10:
                connections.append(
                    f"Connection found: {name} relates to main topic via {len(shared)} shared concepts"
                )
        return connections
 def create_research_orchestrator(
    search_provider: str = "tavily",
    use_wikipedia: bool = True,
    use_academic: bool = True,
 ) -> ResearchOrchestrator:
    """Build a ``ResearchOrchestrator`` from the given options.

    Args:
        search_provider: Name of the web search provider
        use_wikipedia: Whether the orchestrator gets a Wikipedia tool
        use_academic: Whether the orchestrator gets an academic search tool

    Returns:
        The newly constructed ``ResearchOrchestrator``
    """
    options = {
        "search_provider": search_provider,
        "use_wikipedia": use_wikipedia,
        "use_academic": use_academic,
    }
    return ResearchOrchestrator(**options)
@@ -30,6 +30,11 @@ dependencies = [
    "tiktoken>=0.7.0",
    "markdown>=3.7",
    "python-dotenv>=1.0.0",
    # Research dependencies (NEW!)
    "tavily>=0.3.0",
    "wikipedia>=1.4.0",
    "arxiv>=1.4.0",
    "duckduckgo-search>=7.0.0",
 ]
 [project.optional-dependencies]