Add live research capabilities with innovation detection
Research Tools:
- SearchTool: Multiple backends (Tavily, Serper, Brave, DuckDuckGo)
- WikipediaTool: Wikipedia lookup
- AcademicSearchTool: CrossRef, Semantic Scholar
- ResearchOrchestrator: Comprehensive multi-source research

ResearchAgent:
- NOT just fact-checking - actively discovers NEW information
- Identifies trends beyond training data cutoff
- Generates innovations from cross-referencing sources
- Deep research with subtopics

VerifiedFactChecker:
- Live claim verification against web sources
- Confidence scoring
- Citation needed detection

Dependencies added: tavily, wikipedia, arxiv, duckduckgo-search
This commit is contained in:
@@ -22,6 +22,11 @@ from opus_orchestrator.agents.nonfiction import (
|
|||||||
NonfictionWriterAgent,
|
NonfictionWriterAgent,
|
||||||
ResearcherAgent,
|
ResearcherAgent,
|
||||||
)
|
)
|
||||||
|
from opus_orchestrator.agents.research import (
|
||||||
|
ResearchAgent,
|
||||||
|
VerifiedFactChecker,
|
||||||
|
create_research_agent,
|
||||||
|
)
|
||||||
from opus_orchestrator.config import OpusConfig, get_config
|
from opus_orchestrator.config import OpusConfig, get_config
|
||||||
from opus_orchestrator.schemas import (
|
from opus_orchestrator.schemas import (
|
||||||
BookIntent,
|
BookIntent,
|
||||||
@@ -83,6 +88,10 @@ __all__ = [
|
|||||||
"NonfictionWriterAgent",
|
"NonfictionWriterAgent",
|
||||||
"FactCheckerAgent",
|
"FactCheckerAgent",
|
||||||
"NonfictionEditorAgent",
|
"NonfictionEditorAgent",
|
||||||
|
# Research Agent (NEW!)
|
||||||
|
"ResearchAgent",
|
||||||
|
"VerifiedFactChecker",
|
||||||
|
"create_research_agent",
|
||||||
# LangGraph
|
# LangGraph
|
||||||
"OpusGraph",
|
"OpusGraph",
|
||||||
"OpusGraphState",
|
"OpusGraphState",
|
||||||
|
|||||||
@@ -0,0 +1,339 @@
|
|||||||
|
"""Research Agent for Opus Orchestrator.
|
||||||
|
|
||||||
|
Enhanced nonfiction agent with live research capabilities.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
from typing import Any, Optional
|
||||||
|
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
|
load_dotenv()
|
||||||
|
|
||||||
|
from opus_orchestrator.agents.base import BaseAgent, AgentResponse
|
||||||
|
from opus_orchestrator.utils.research import (
|
||||||
|
ResearchOrchestrator,
|
||||||
|
create_research_orchestrator,
|
||||||
|
SearchTool,
|
||||||
|
WikipediaTool,
|
||||||
|
AcademicSearchTool,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# System prompt handed to the LLM by ResearchAgent. The text is runtime data:
# it tells the model which tools back the research and which Markdown layout
# (Findings / Sources / Innovations / Research Gaps) the answer should follow.
RESEARCH_AGENT_SYSTEM_PROMPT = """## Role: Research Agent with Live Web Access

You are The Researcher — an AI agent with live access to the internet, academic databases, and research tools.

## Your Capabilities

1. **Web Search** - Search the current web for latest information
2. **Wikipedia** - Access encyclopedic knowledge
3. **Academic Search** - Find peer-reviewed papers (CrossRef, Semantic Scholar)
4. **Innovation Detection** - Identify gaps and new ideas beyond training data

## Your Mission

NOT just verify facts — **DISCOVER new information, trends, and innovations**.

- Find what's NEW since your training cutoff
- Identify research gaps and opportunities
- Connect disparate ideas into novel insights
- Go beyond what you "know" to what you can FIND

## Research Process

1. **Explore** - Broad search on topic
2. **Deep Dive** - Specific searches on subtopics
3. **Cross-Reference** - Find connections between sources
4. **Innovate** - Generate original insights beyond training data

## Output Format

Provide your research in this structure:

```
## Findings (What you discovered)
- [New information 1]
- [New information 2]
- [Latest developments]

## Sources (Where you found it)
- [URL 1]: [Title]
- [URL 2]: [Title]

## Innovations (Original insights beyond training data)
- [Novel connection 1]
- [Novel connection 2]

## Research Gaps (What's not well-covered)
- [Gap 1]
- [Gap 2]
```

## Remember

You're not just fact-checking — you're RESEARCHING. Actively seek new information,
challenge assumptions, and generate original ideas. This keeps the content fresh
and prevents "AI slop" from repetitive training data patterns.
"""
|
||||||
|
|
||||||
|
|
||||||
|
class ResearchAgent(BaseAgent):
    """Research agent that gathers live sources and has the LLM synthesize them.

    Raw material comes from a ``ResearchOrchestrator`` (web search plus the
    optional Wikipedia and academic lookups). The results are condensed into a
    Markdown summary and sent to the LLM together with
    ``RESEARCH_AGENT_SYSTEM_PROMPT``.
    """

    # Caps applied when condensing raw results into the LLM prompt.
    _MAX_WEB_RESULTS = 5
    _MAX_WIKI_RESULTS = 3
    _MAX_ACADEMIC_RESULTS = 5
    _SNIPPET_CHARS = 200

    def __init__(
        self,
        config=None,
        search_provider: str = "tavily",
        use_wikipedia: bool = True,
        use_academic: bool = True,
    ):
        """Initialize research agent with tools.

        Args:
            config: Agent configuration, forwarded unchanged to ``BaseAgent``.
            search_provider: Search provider (tavily, serper, brave, duckduckgo).
            use_wikipedia: Include Wikipedia search.
            use_academic: Include academic search.
        """
        # The orchestrator performs the actual multi-source research.
        self.research = create_research_orchestrator(
            search_provider=search_provider,
            use_wikipedia=use_wikipedia,
            use_academic=use_academic,
        )

        # Direct tool handles; ``execute`` itself only goes through
        # ``self.research``, these stay available as public attributes.
        self.search_tool = SearchTool(provider=search_provider)
        self.wikipedia = WikipediaTool() if use_wikipedia else None
        self.academic = AcademicSearchTool() if use_academic else None

        super().__init__(
            role="Research Agent",
            description="Live web research with innovation detection",
            system_prompt=RESEARCH_AGENT_SYSTEM_PROMPT,
            config=config,
        )

    async def execute(self, input_data: Any, context: dict[str, Any]) -> AgentResponse:
        """Execute research task with live tools.

        Args:
            input_data: Either a dict with ``query`` and optional ``subtopics``
                / ``deep_research`` keys, or any other value, which is
                stringified and used as the query.
            context: Additional context, passed to ``build_system_prompt``.

        Returns:
            On success an ``AgentResponse`` whose output holds ``raw_results``,
            ``synthesis`` and ``query``. On an empty query or any exception
            during research/synthesis, a failed ``AgentResponse`` with ``error``
            set.
        """
        if isinstance(input_data, dict):
            query = input_data.get("query", "")
            # ``or []`` also covers an explicit ``"subtopics": None``.
            subtopics = input_data.get("subtopics") or []
            deep = input_data.get("deep_research", False)
        else:
            query = str(input_data)
            subtopics = []
            deep = False

        if not query:
            return self._failure("No research query provided")

        try:
            if deep or subtopics:
                # Deep research: main topic plus one search per subtopic.
                results = self.research.deep_research(query, subtopics)
            else:
                # Quick single-pass search across all sources.
                results = self.research.comprehensive_search(query)

            research_summary = self._format_research_for_llm(results)

            synthesis = await self.call_llm(
                system_prompt=self.build_system_prompt(context),
                user_prompt=f"""Based on this research data, provide analysis and insights:

{research_summary}

Task: {query}

Provide:
1. Key findings synthesized
2. Most important innovations/discoveries
3. How this goes beyond typical training data
4. Recommendations for the manuscript""",
            )

            return AgentResponse(
                success=True,
                output={
                    "raw_results": results,
                    "synthesis": synthesis,
                    "query": query,
                },
                metadata={
                    "role": "Research Agent",
                    "search_provider": self.research.search.provider,
                },
            )
        except Exception as e:
            # Boundary of the agent: report the failure instead of raising.
            return self._failure(f"Research failed: {str(e)}")

    def _failure(self, message: str) -> AgentResponse:
        """Build the failed ``AgentResponse`` used for every error path."""
        return AgentResponse(
            success=False,
            output=None,
            error=message,
            metadata={"role": "Research Agent"},
        )

    def _format_research_for_llm(self, results: dict) -> str:
        """Format research results for LLM consumption.

        Accepts both result shapes the orchestrator produces: the flat dict of
        ``comprehensive_search`` and the nested dict of ``deep_research``
        (``main_research`` / ``subtopic_research`` / ``cross_references``).
        Previously the nested shape was read with the flat keys, which left the
        LLM with an empty summary for every deep-research request.
        """
        if "main_research" not in results:
            return "\n".join(self._format_search_section(results))

        lines = [f"# Deep Research Topic: {results.get('main_topic', '')}", ""]
        lines.extend(self._format_search_section(results.get("main_research") or {}))

        for subtopic, sub_results in (results.get("subtopic_research") or {}).items():
            lines.append(f"# Subtopic: {subtopic}")
            lines.extend(self._format_search_section(sub_results or {}))

        cross_references = results.get("cross_references") or []
        if cross_references:
            lines.append("## Cross-References")
            lines.extend(f"- {ref}" for ref in cross_references)
            lines.append("")

        return "\n".join(lines)

    def _format_search_section(self, results: dict) -> list[str]:
        """Render one flat ``comprehensive_search`` result as Markdown lines."""
        lines = [
            f"# Research Query: {results.get('query', '')}",
            f"Timestamp: {results.get('timestamp', '')}",
            "",
        ]

        web = results.get("web") or []
        if web:
            lines.append("## Web Search Results")
            for rank, hit in enumerate(web[: self._MAX_WEB_RESULTS], 1):
                # ``or ""`` guards backends that return ``None`` content.
                snippet = (hit.get("content") or "")[: self._SNIPPET_CHARS]
                lines.append(f"{rank}. **{hit.get('title', '')}**")
                lines.append(f" URL: {hit.get('url', '')}")
                lines.append(f" {snippet}...")
            lines.append("")

        wiki = results.get("wikipedia") or []
        if wiki:
            lines.append("## Wikipedia Results")
            for article in wiki[: self._MAX_WIKI_RESULTS]:
                summary = (article.get("summary") or "")[: self._SNIPPET_CHARS]
                lines.append(f"- {article.get('title', '')}: {summary}...")
            lines.append("")

        academic = results.get("academic") or []
        if academic:
            lines.append("## Academic Papers")
            for paper in academic[: self._MAX_ACADEMIC_RESULTS]:
                lines.append(f"- {paper.get('title', '')} ({paper.get('year', 'N/A')})")
                lines.append(f" {paper.get('journal', '')}")
            lines.append("")

        innovations = results.get("innovations") or []
        if innovations:
            lines.append("## Innovations & New Ideas")
            lines.extend(f"- {idea}" for idea in innovations)
            lines.append("")

        return lines
|
||||||
|
|
||||||
|
|
||||||
|
# Fact-checking with live verification
|
||||||
|
class VerifiedFactChecker:
    """Fact checker with live source verification.

    Verification is a lexical heuristic, not semantic analysis: a source
    "supports" a claim when it contains most of the claim's words, and
    "contradicts" it when it does not but contains a negation marker.
    """

    # A source supports the claim when MORE than this fraction of the claim's
    # distinct words occur in it.
    SUPPORT_OVERLAP_RATIO = 0.7
    # Whole words that mark a non-supporting source as contradicting.
    _NEGATION_WORDS = frozenset({"not", "false", "incorrect"})
    # Stripped from both ends of each word so "France." matches "france".
    _PUNCTUATION = ".,;:!?\"'()[]{}<>"

    def __init__(self, search_provider: str = "tavily"):
        """Initialize verified fact checker.

        Args:
            search_provider: Provider name handed to ``SearchTool``.
        """
        self.search = SearchTool(provider=search_provider)
        self.wikipedia = WikipediaTool()

    @classmethod
    def _words(cls, text) -> set:
        """Return the distinct lower-cased words of *text* (``None`` -> empty set)."""
        stripped = (word.strip(cls._PUNCTUATION) for word in (text or "").lower().split())
        return {word for word in stripped if word}

    async def verify_claim(
        self,
        claim: str,
        context: str = "",
    ) -> dict:
        """Verify a factual claim against live sources.

        Args:
            claim: The claim to verify.
            context: Additional context (accepted for interface compatibility;
                not used by the heuristic).

        Returns:
            Dict with ``claim``, ``verified`` (at least one supporting source),
            ``confidence`` (supporting / all sources, 0.0 when there are none),
            the three source lists, and ``needs_citation`` (confidence < 0.8).
        """
        claim_words = self._words(claim)
        supporting = []
        contradicting = []
        neutral = []

        # A blank claim has nothing to match, so skip the network calls.
        if claim_words:
            sources = list(self.search.search(claim, num_results=5) or [])
            # Wikipedia hits used to be fetched and then discarded; they are
            # now scored like any other source.
            sources.extend(self.wikipedia.search(claim, num_results=2) or [])

            required_overlap = len(claim_words) * self.SUPPORT_OVERLAP_RATIO
            for source in sources:
                content_words = self._words(source.get("content"))
                if len(claim_words & content_words) > required_overlap:
                    supporting.append(source)
                elif content_words & self._NEGATION_WORDS:
                    # Whole-word test: the old substring check fired on
                    # "notable", "another", "note", ...
                    contradicting.append(source)
                else:
                    neutral.append(source)

        total = len(supporting) + len(contradicting) + len(neutral)
        confidence = len(supporting) / total if total else 0.0

        return {
            "claim": claim,
            "verified": len(supporting) > 0,
            "confidence": confidence,
            "supporting_sources": supporting,
            "contradicting_sources": contradicting,
            "neutral_sources": neutral,
            "needs_citation": confidence < 0.8,
        }

    async def verify_batch(
        self,
        claims: list[str],
    ) -> list[dict]:
        """Verify multiple claims.

        Claims are verified one after another, in input order.

        Args:
            claims: List of claims to verify.

        Returns:
            List of verification results, one per claim.
        """
        return [await self.verify_claim(claim) for claim in claims]
|
||||||
|
|
||||||
|
|
||||||
|
def create_research_agent(
    search_provider: str = "tavily",
    use_wikipedia: bool = True,
    use_academic: bool = True,
    config=None,
) -> ResearchAgent:
    """Factory to create a research agent.

    Mirrors the options of ``create_research_orchestrator`` so callers can
    configure the agent without instantiating ``ResearchAgent`` directly.
    Calling it with only ``search_provider`` behaves as before.

    Args:
        search_provider: Search provider (tavily, serper, brave, duckduckgo).
        use_wikipedia: Include Wikipedia search.
        use_academic: Include academic search.
        config: Agent configuration forwarded to ``ResearchAgent``.

    Returns:
        Configured ResearchAgent
    """
    return ResearchAgent(
        config=config,
        search_provider=search_provider,
        use_wikipedia=use_wikipedia,
        use_academic=use_academic,
    )
|
||||||
@@ -5,6 +5,13 @@ from opus_orchestrator.utils.github_ingest import GitHubIngestor, create_github_
|
|||||||
from opus_orchestrator.utils.s3_ingest import S3Ingestor, create_s3_ingestor
|
from opus_orchestrator.utils.s3_ingest import S3Ingestor, create_s3_ingestor
|
||||||
from opus_orchestrator.utils.local_ingest import LocalIngestor, create_local_ingestor
|
from opus_orchestrator.utils.local_ingest import LocalIngestor, create_local_ingestor
|
||||||
from opus_orchestrator.utils.llm import get_llm_client
|
from opus_orchestrator.utils.llm import get_llm_client
|
||||||
|
from opus_orchestrator.utils.research import (
|
||||||
|
ResearchOrchestrator,
|
||||||
|
SearchTool,
|
||||||
|
WikipediaTool,
|
||||||
|
AcademicSearchTool,
|
||||||
|
create_research_orchestrator,
|
||||||
|
)
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"generate_docs",
|
"generate_docs",
|
||||||
@@ -15,4 +22,10 @@ __all__ = [
|
|||||||
"LocalIngestor",
|
"LocalIngestor",
|
||||||
"create_local_ingestor",
|
"create_local_ingestor",
|
||||||
"get_llm_client",
|
"get_llm_client",
|
||||||
|
# Research (NEW!)
|
||||||
|
"ResearchOrchestrator",
|
||||||
|
"SearchTool",
|
||||||
|
"WikipediaTool",
|
||||||
|
"AcademicSearchTool",
|
||||||
|
"create_research_orchestrator",
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -0,0 +1,496 @@
|
|||||||
|
"""Research tools for Opus Orchestrator.
|
||||||
|
|
||||||
|
Provides web search, database lookup, and research capabilities.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import json
|
||||||
|
from typing import Any, Optional, Callable
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
import requests
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
|
load_dotenv()
|
||||||
|
|
||||||
|
|
||||||
|
class SearchTool:
    """Web search tool using multiple backends.

    Every backend returns the same normalized shape: a list of dicts with
    ``title``, ``url``, ``content`` and ``score`` keys. Backend failures are
    printed to stdout and produce an empty list instead of raising.
    """

    # Environment variable holding the API key of each key-based provider.
    # Providers missing here (duckduckgo, unknown names) need no key.
    _API_KEY_ENV_VARS = {
        "tavily": "TAVILY_API_KEY",
        "serper": "SERPER_API_KEY",
        "brave": "BRAVE_API_KEY",
    }

    def __init__(self, provider: str = "tavily"):
        """Initialize search tool.

        Args:
            provider: Search provider (tavily, serper, brave, duckduckgo).
                Any other name falls back to DuckDuckGo when searching.
        """
        self.provider = provider
        self._setup_provider()

    def _setup_provider(self):
        """Load the provider's API key from the environment.

        ``api_key`` is always defined afterwards; it is ``None`` for keyless
        providers and when the environment variable is unset. (It used to be
        left undefined for duckduckgo and unknown providers.)
        """
        env_var = self._API_KEY_ENV_VARS.get(self.provider)
        self.api_key = os.environ.get(env_var) if env_var else None

    def search(
        self,
        query: str,
        num_results: int = 10,
    ) -> list[dict]:
        """Search the web.

        Args:
            query: Search query
            num_results: Number of results to return

        Returns:
            List of search results with title, url, content and score.
        """
        backends = {
            "tavily": self._search_tavily,
            "serper": self._search_serper,
            "brave": self._search_brave,
        }
        # Unknown providers fall back to the keyless DuckDuckGo backend.
        backend = backends.get(self.provider, self._search_duckduckgo)
        return backend(query, num_results)

    def _search_tavily(self, query: str, num_results: int) -> list[dict]:
        """Search using Tavily.

        NOTE(review): ``TavilyClient`` appears to be distributed on PyPI as
        ``tavily-python`` - confirm the project's dependency list installs the
        distribution that provides this import.
        """
        try:
            from tavily import TavilyClient

            client = TavilyClient(api_key=self.api_key)
            response = client.search(query=query, max_results=num_results)
            return [
                {
                    "title": hit.get("title", ""),
                    "url": hit.get("url", ""),
                    "content": hit.get("content", ""),
                    "score": hit.get("score", 0),
                }
                for hit in (response.get("results") or [])
            ]
        except Exception as e:
            print(f"Tavily search error: {e}")
            return []

    def _search_serper(self, query: str, num_results: int) -> list[dict]:
        """Search using Serper."""
        try:
            response = requests.post(
                "https://google.serper.dev/search",
                headers={
                    "X-API-KEY": self.api_key,
                    "Content-Type": "application/json",
                },
                json={"q": query, "num": num_results},
                timeout=10,
            )
            # Without this, an auth/quota error body parses as JSON and is
            # silently reported as "no results".
            response.raise_for_status()
            data = response.json()
            return [
                {
                    "title": hit.get("title", ""),
                    "url": hit.get("link", ""),
                    "content": hit.get("snippet", ""),
                    "score": 1.0,
                }
                for hit in (data.get("organic") or [])
            ]
        except Exception as e:
            print(f"Serper search error: {e}")
            return []

    def _search_brave(self, query: str, num_results: int) -> list[dict]:
        """Search using Brave."""
        try:
            response = requests.get(
                "https://api.search.brave.com/res/v1/web/search",
                params={"q": query, "count": num_results},
                headers={"Accept": "application/json", "X-Subscription-Token": self.api_key},
                timeout=10,
            )
            # Surface HTTP errors instead of returning a silent empty list.
            response.raise_for_status()
            data = response.json()
            return [
                {
                    "title": hit.get("title", ""),
                    "url": hit.get("url", ""),
                    "content": hit.get("description", ""),
                    "score": hit.get("score", 0),
                }
                for hit in ((data.get("web") or {}).get("results") or [])
            ]
        except Exception as e:
            print(f"Brave search error: {e}")
            return []

    def _search_duckduckgo(self, query: str, num_results: int) -> list[dict]:
        """Search using DuckDuckGo (no API key needed)."""
        try:
            from duckduckgo_search import DDGS

            hits = DDGS().text(query, max_results=num_results)
            return [
                {
                    "title": hit.get("title", ""),
                    "url": hit.get("href", ""),
                    "content": hit.get("body", ""),
                    "score": 1.0,
                }
                for hit in (hits or [])
            ]
        except Exception as e:
            print(f"DuckDuckGo search error: {e}")
            return []
|
||||||
|
|
||||||
|
|
||||||
|
class WikipediaTool:
    """Wikipedia lookup tool built on the ``wikipedia`` package.

    The package is imported lazily inside each method, so the import cost and
    any ImportError surface only when the tool is actually used.
    """

    def __init__(self):
        """Initialize Wikipedia tool (stateless; nothing to set up)."""

    def search(self, query: str, num_results: int = 5) -> list[dict]:
        """Search Wikipedia.

        Args:
            query: Search query
            num_results: Number of results

        Returns:
            List of article dicts (``title``, ``url``, ``summary`` capped at
            500 chars, ``content`` capped at 2000 chars). Titles whose page
            cannot be loaded are skipped; a failing search yields ``[]``.
        """
        try:
            import wikipedia

            titles = wikipedia.search(query, results=num_results)
        except Exception as e:
            print(f"Wikipedia search error: {e}")
            return []

        articles = []
        for title in titles:
            try:
                # Titles come straight from the search API, so they are exact:
                # disable auto-suggest, which can rewrite a valid title into a
                # different (or missing) page.
                page = wikipedia.page(title, auto_suggest=False)
                articles.append({
                    "title": page.title,
                    "url": page.url,
                    "summary": page.summary[:500],
                    "content": page.content[:2000],
                })
            except Exception:
                # Best effort: disambiguation pages, missing pages or a failed
                # fetch drop this one title, not the whole result list.
                continue
        return articles

    def get_article(self, title: str) -> dict:
        """Get a Wikipedia article by title.

        Args:
            title: Article title

        Returns:
            Dict with ``title``, ``url``, ``summary``, ``content`` (capped at
            5000 chars) and up to 10 ``references``; or ``{"error": message}``
            when the article cannot be loaded.
        """
        try:
            import wikipedia

            page = wikipedia.page(title)
            try:
                references = list(page.references[:10])
            except Exception:
                # ``references`` is evaluated lazily and ``hasattr`` only
                # shields AttributeError; any other failure used to discard
                # the whole article. Missing references are not fatal.
                references = []
            return {
                "title": page.title,
                "url": page.url,
                "summary": page.summary,
                "content": page.content[:5000],
                "references": references,
            }
        except Exception as e:
            return {"error": str(e)}
|
||||||
|
|
||||||
|
|
||||||
|
class ArxivTool:
    """ArXiv paper search tool built on the ``arxiv`` package."""

    def __init__(self):
        """Initialize ArXiv tool (stateless; nothing to set up)."""

    @staticmethod
    def _build_query(query: str, categories: Optional[list[str]]) -> str:
        """Combine the free-text query with an optional ``cat:`` filter."""
        if not categories:
            return query
        category_filter = " OR ".join(f"cat:{category}" for category in categories)
        if not query:
            return category_filter
        return f"({query}) AND ({category_filter})"

    def search(
        self,
        query: str,
        max_results: int = 10,
        categories: Optional[list[str]] = None,
    ) -> list[dict]:
        """Search ArXiv for papers.

        Args:
            query: Search query
            max_results: Max results
            categories: ArXiv categories to filter by (e.g. ``["cs.AI"]``);
                a paper matches if it belongs to any of them.

        Returns:
            List of paper dicts (``title``, ``url``, ``abstract`` capped at
            1000 chars, ``authors``, ``published`` as ISO date, ``categories``).
            Any failure is printed and yields ``[]``.
        """
        try:
            import arxiv

            # ``arxiv.Search`` has no ``categories`` argument - passing one
            # raised TypeError, so every search returned []. Categories are
            # expressed in the query string instead.
            search = arxiv.Search(
                query=self._build_query(query, categories),
                max_results=max_results,
            )
            client = arxiv.Client()
            return [
                {
                    "title": paper.title,
                    "url": paper.entry_id,
                    "abstract": paper.summary[:1000],
                    "authors": [author.name for author in paper.authors],
                    "published": str(paper.published.date()),
                    "categories": paper.categories,
                }
                for paper in client.results(search)
            ]
        except Exception as e:
            print(f"ArXiv search error: {e}")
            return []
|
||||||
|
|
||||||
|
|
||||||
|
class AcademicSearchTool:
    """Academic paper search (CrossRef, Semantic Scholar).

    Both searches print failures to stdout and return ``[]`` instead of
    raising. Null or missing fields on a single record no longer discard the
    whole result list.
    """

    def __init__(self):
        """Initialize academic search tool (stateless; nothing to set up)."""

    @staticmethod
    def _first(values) -> str:
        """Return the first element of a list-valued field, or ``""``."""
        return values[0] if values else ""

    @staticmethod
    def _crossref_year(item: dict):
        """Return the publication year of a CrossRef work, or ``None``.

        Prefers the publication dates and only falls back to ``created``,
        the field the original code read unconditionally.
        NOTE(review): ``created`` looks like the DOI registration date, which
        would post-date publication for older papers - confirm against the
        CrossRef API docs.
        """
        for field in ("issued", "published", "created"):
            date_parts = (item.get(field) or {}).get("date-parts") or []
            if date_parts and date_parts[0] and date_parts[0][0] is not None:
                return date_parts[0][0]
        return None

    @staticmethod
    def _crossref_author(author: dict) -> str:
        """Format one CrossRef author entry as a display name."""
        person = " ".join(part for part in (author.get("given"), author.get("family")) if part)
        # NOTE(review): fallback assumes non-person authors carry a "name" key.
        return person or author.get("name", "")

    def search_crossref(self, query: str, max_results: int = 10) -> list[dict]:
        """Search CrossRef for academic papers.

        Args:
            query: Search query.
            max_results: Maximum number of works to request.

        Returns:
            List of dicts with ``title``, ``url``, ``authors``, ``year``,
            ``journal`` and ``doi``.
        """
        try:
            response = requests.get(
                "https://api.crossref.org/works",
                params={"query": query, "rows": max_results},
                timeout=10,
            )
            # Report HTTP errors instead of parsing an error body as "empty".
            response.raise_for_status()
            items = (response.json().get("message") or {}).get("items") or []
            return [
                {
                    "title": self._first(item.get("title")),
                    "url": item.get("URL", ""),
                    "authors": [self._crossref_author(a) for a in (item.get("author") or [])],
                    "year": self._crossref_year(item),
                    "journal": self._first(item.get("container-title")),
                    "doi": item.get("DOI", ""),
                }
                for item in items
            ]
        except Exception as e:
            print(f"CrossRef search error: {e}")
            return []

    def search_semantic_scholar(self, query: str, max_results: int = 10) -> list[dict]:
        """Search Semantic Scholar for papers.

        Args:
            query: Search query.
            max_results: Maximum number of papers to request.

        Returns:
            List of dicts with ``title``, ``url``, ``abstract`` (capped at 500
            chars), up to five ``authors``, ``year`` and ``citations``.
        """
        try:
            response = requests.get(
                "https://api.semanticscholar.org/graph/v1/paper/search",
                params={
                    "query": query,
                    "limit": max_results,
                    "fields": "title,url,abstract,authors,year,citationCount",
                },
                timeout=10,
            )
            response.raise_for_status()
            papers = response.json().get("data") or []
            return [
                {
                    "title": paper.get("title", ""),
                    "url": paper.get("url", ""),
                    # A null abstract/authors list previously raised TypeError
                    # here and threw away every result of the search.
                    "abstract": (paper.get("abstract") or "")[:500],
                    "authors": [a.get("name", "") for a in (paper.get("authors") or [])[:5]],
                    "year": paper.get("year"),
                    "citations": paper.get("citationCount") or 0,
                }
                for paper in papers
            ]
        except Exception as e:
            print(f"Semantic Scholar search error: {e}")
            return []
|
||||||
|
|
||||||
|
|
||||||
|
class ResearchOrchestrator:
    """Orchestrates research across multiple tools.

    Combines a web ``SearchTool`` with the optional Wikipedia and academic
    tools, and derives simple heuristic "innovations" and cross-references
    from the gathered text.
    """

    # Subtopic/main-topic pairs must share MORE than this many content words
    # to be reported as connected.
    _MIN_SHARED_WORDS = 10
    _PUNCTUATION = ".,;:!?\"'()[]{}<>"
    # Function words that say nothing about topical overlap.
    _STOPWORDS = frozenset({
        "a", "an", "and", "are", "as", "at", "be", "been", "but", "by", "can",
        "for", "from", "had", "has", "have", "in", "into", "is", "it", "its",
        "of", "on", "or", "that", "the", "their", "there", "these", "this",
        "those", "to", "was", "were", "which", "will", "with",
    })

    def __init__(
        self,
        search_provider: str = "tavily",
        use_wikipedia: bool = True,
        use_academic: bool = True,
    ):
        """Initialize research orchestrator.

        Args:
            search_provider: Search provider to use
            use_wikipedia: Include Wikipedia
            use_academic: Include academic search
        """
        self.search = SearchTool(provider=search_provider)
        self.wikipedia = WikipediaTool() if use_wikipedia else None
        self.academic = AcademicSearchTool() if use_academic else None

    def comprehensive_search(
        self,
        query: str,
        include_web: bool = True,
        include_wikipedia: bool = True,
        include_academic: bool = True,
    ) -> dict:
        """Run comprehensive research across all sources.

        Args:
            query: Research query
            include_web: Include web search
            include_wikipedia: Include Wikipedia
            include_academic: Include academic papers

        Returns:
            Dict with ``query``, ``timestamp`` (local ISO time), ``web``,
            ``wikipedia``, ``academic`` and ``innovations``. Sources that are
            disabled or excluded stay as empty lists.
        """
        results = {
            "query": query,
            "timestamp": datetime.now().isoformat(),
            "web": [],
            "wikipedia": [],
            "academic": [],
            "innovations": [],
        }

        if include_web:
            results["web"] = self.search.search(query, num_results=10)

        if self.wikipedia and include_wikipedia:
            results["wikipedia"] = self.wikipedia.search(query, num_results=5)

        academic_searched = bool(self.academic and include_academic)
        if academic_searched:
            papers = list(self.academic.search_crossref(query, max_results=5))
            papers.extend(self.academic.search_semantic_scholar(query, max_results=5))
            results["academic"] = papers

        results["innovations"] = self._generate_innovations(
            results, academic_searched=academic_searched
        )
        return results

    def _generate_innovations(self, research: dict, academic_searched: bool = True) -> list[str]:
        """Generate innovative ideas from research.

        These are fixed heuristic notes derived from which sources returned
        data; no language model is involved.

        Args:
            research: Combined research results
            academic_searched: Whether academic sources were actually queried.
                When they were not, an empty ``academic`` list says nothing
                about coverage, so no research gap is claimed.

        Returns:
            List of innovative ideas/connections
        """
        innovations = []

        web_content = " ".join(
            (hit.get("content") or "")[:200] for hit in (research.get("web") or [])[:5]
        )
        academic_titles = [
            paper.get("title", "") for paper in (research.get("academic") or [])[:5]
        ]

        # Both web and academic material present: suggest combining them.
        if web_content and academic_titles:
            innovations.append(
                "Cross-disciplinary connection: Apply web trends to academic findings"
            )

        # Few papers found by a search that actually ran: flag a possible gap.
        if academic_searched and len(research.get("academic") or []) < 3:
            innovations.append(
                "Research gap: Limited academic coverage - original contribution opportunity"
            )

        # Provenance note recording when the research was gathered.
        innovations.append(
            f"Research timestamp: {research.get('timestamp')} - ensures current information"
        )

        return innovations

    def deep_research(
        self,
        topic: str,
        subtopics: Optional[list[str]] = None,
    ) -> dict:
        """Perform deep research on a topic and its subtopics.

        Args:
            topic: Main topic
            subtopics: Related subtopics to research; each is searched as
                ``"<topic>: <subtopic>"``.

        Returns:
            Dict with ``main_topic``, ``main_research``, ``subtopic_research``
            (keyed by subtopic) and ``cross_references``.
        """
        results = {
            "main_topic": topic,
            "main_research": self.comprehensive_search(topic),
            "subtopic_research": {},
        }

        for subtopic in (subtopics or []):
            results["subtopic_research"][subtopic] = self.comprehensive_search(
                f"{topic}: {subtopic}"
            )

        results["cross_references"] = self._cross_reference(results)
        return results

    @classmethod
    def _content_words(cls, web_results: list) -> set:
        """Return the distinct content words of the top three web snippets."""
        text = " ".join((hit.get("content") or "")[:300] for hit in (web_results or [])[:3])
        words = (word.strip(cls._PUNCTUATION) for word in text.lower().split())
        return {word for word in words if word and word not in cls._STOPWORDS}

    def _cross_reference(self, deep_results: dict) -> list[str]:
        """Find cross-references between main and subtopic research.

        A subtopic is reported when its web snippets share more than
        ``_MIN_SHARED_WORDS`` content words with the main topic's snippets.
        Stop-words and surrounding punctuation are ignored; counting them made
        almost any two English texts look connected.
        """
        main_words = self._content_words(
            (deep_results.get("main_research") or {}).get("web")
        )
        if not main_words:
            return []

        refs = []
        for subtopic, sub_data in (deep_results.get("subtopic_research") or {}).items():
            shared = main_words & self._content_words((sub_data or {}).get("web"))
            if len(shared) > self._MIN_SHARED_WORDS:
                refs.append(
                    f"Connection found: {subtopic} relates to main topic via {len(shared)} shared concepts"
                )
        return refs
|
||||||
|
|
||||||
|
|
||||||
|
def create_research_orchestrator(
    search_provider: str = "tavily",
    use_wikipedia: bool = True,
    use_academic: bool = True,
) -> ResearchOrchestrator:
    """Build a ``ResearchOrchestrator`` from the given options.

    Args:
        search_provider: Name of the web search provider.
        use_wikipedia: Whether the orchestrator gets a Wikipedia tool.
        use_academic: Whether the orchestrator gets an academic search tool.

    Returns:
        Configured ResearchOrchestrator
    """
    # Collected once so the options map one-to-one onto the constructor.
    options = {
        "search_provider": search_provider,
        "use_wikipedia": use_wikipedia,
        "use_academic": use_academic,
    }
    orchestrator = ResearchOrchestrator(**options)
    return orchestrator
|
||||||
@@ -30,6 +30,11 @@ dependencies = [
|
|||||||
"tiktoken>=0.7.0",
|
"tiktoken>=0.7.0",
|
||||||
"markdown>=3.7",
|
"markdown>=3.7",
|
||||||
"python-dotenv>=1.0.0",
|
"python-dotenv>=1.0.0",
|
||||||
|
# Research dependencies (NEW!)
|
||||||
|
"tavily>=0.3.0",
|
||||||
|
"wikipedia>=1.4.0",
|
||||||
|
"arxiv>=1.4.0",
|
||||||
|
"duckduckgo-search>=7.0.0",
|
||||||
]
|
]
|
||||||
|
|
||||||
[project.optional-dependencies]
|
[project.optional-dependencies]
|
||||||
|
|||||||
Reference in New Issue
Block a user