"""Agent-Driven Web Crawler for Opus.

Uses AI to analyze sites, decide what to crawl, and intelligently extract content.
Instead of hardcoded patterns, the agent understands context and adapts.
"""

import asyncio
import logging
import time
from dataclasses import dataclass, field
from enum import Enum
from typing import Optional

logger = logging.getLogger(__name__)


class CrawlPurpose(str, Enum):
    """What the user intends to do with the crawled content."""

    DOCUMENTATION = "documentation"  # Technical docs, guides
    TRAINING = "training"  # Learning materials
    KNOWLEDGE = "knowledge"  # General knowledge base
    RESEARCH = "research"  # Research papers, articles
    REFERENCE = "reference"  # Reference material


@dataclass
class PageResult:
    """A single crawled page."""

    url: str
    title: str
    content: str  # extracted main content, not the raw HTML
    relevance_score: float  # compared against the crawler's relevance threshold
    links: list[str]
    depth: int


@dataclass
class SiteAnalysis:
    """Agent's analysis of a site."""

    site_type: str  # documentation, blog, wiki, etc.
    sections: dict  # section -> priority
    suggested_urls: list[str]
    skip_patterns: list[str]
    reasoning: str


@dataclass
class AgentCrawlResult:
    """Result from agent crawling."""

    pages: list[PageResult]  # only the pages that passed the relevance threshold
    site_analysis: SiteAnalysis
    total_fetched: int  # pages fetched successfully, relevant or not
    total_relevant: int  # len(pages)
    duration_seconds: float


class AgentWebCrawler:
    """AI-powered web crawler that uses an agent to decide what to crawl.

    A crawl runs in three steps:
    1. Analyze the site (with the LLM client when one is configured,
       otherwise with simple heuristics).
    2. Turn the analysis into a list of URLs with priorities.
    3. Fetch each URL, score its relevance for the crawl purpose and keep
       the pages that score above the relevance threshold.
    """

    def __init__(
        self,
        llm_client=None,
        max_pages: int = 50,
        max_depth: int = 3,
        delay_seconds: float = 1.0,
        user_agent: str = "OpusCrawler/1.0",
        relevance_threshold: float = 0.3,
        request_timeout: float = 30,
    ):
        """Configure the crawler.

        Args:
            llm_client: Optional client exposing an awaitable
                ``complete(system_prompt=..., user_prompt=...)``. When falsy,
                heuristic fallbacks are used instead.
            max_pages: Upper bound on the number of pages fetched per crawl.
            max_depth: Stored on the instance; not consulted by the methods
                visible in this part of the file.
            delay_seconds: Pause after every fetch attempt.
            user_agent: Value sent in the ``User-Agent`` request header.
            relevance_threshold: A page is kept only when its relevance score
                is strictly greater than this value.
            request_timeout: Timeout, in seconds, passed to the HTTP request.
        """
        self.llm = llm_client
        self.max_pages = max_pages
        self.max_depth = max_depth
        self.delay = delay_seconds
        self.user_agent = user_agent
        self.relevance_threshold = relevance_threshold
        self.request_timeout = request_timeout
        # URLs already attempted during the current crawl.
        self._fetched_urls: set[str] = set()

    async def crawl(
        self,
        start_url: str,
        purpose: CrawlPurpose = CrawlPurpose.DOCUMENTATION,
    ) -> AgentCrawlResult:
        """Crawl a site using AI to decide what matters.

        Args:
            start_url: Where to begin
            purpose: What the content is for

        Returns:
            AgentCrawlResult with pages and analysis. ``total_fetched`` counts
            every page fetched successfully; ``total_relevant`` counts only
            the pages kept in ``pages``.

        Raises:
            Whatever ``_analyze_site`` or ``_decide_urls`` raise. Failures of
            individual page fetches are logged and skipped instead.
        """
        # Monotonic clock: the duration must not be affected by wall-clock jumps.
        started = time.monotonic()

        # Start every crawl with a clean slate so that a reused crawler does
        # not skip pages it happened to see in an earlier run.
        self._fetched_urls.clear()

        # Step 1: Analyze the site
        analysis = await self._analyze_site(start_url, purpose)

        # Step 2: Decide what to fetch (agent reasoning)
        urls_to_fetch = await self._decide_urls(analysis, purpose)

        # Step 3: Fetch in the decided order with relevance scoring
        pages: list[PageResult] = []
        fetched_count = 0

        for url_info in urls_to_fetch:
            if fetched_count >= self.max_pages:
                break

            url = url_info["url"]
            if url in self._fetched_urls:
                # The same URL was suggested more than once; fetch it only once.
                continue
            self._fetched_urls.add(url)

            try:
                page = await self._fetch_and_analyze(
                    url,
                    url_info["priority"],
                    purpose,
                )
            except Exception as exc:
                # Best effort: one broken page must not abort the whole crawl.
                logger.warning("Failed to fetch %s: %s", url, exc)
            else:
                fetched_count += 1
                if page.relevance_score > self.relevance_threshold:
                    pages.append(page)

            # Be nice: pause after every attempt, including failed ones, so a
            # run of errors does not turn into a burst of requests.
            await asyncio.sleep(self.delay)

        return AgentCrawlResult(
            pages=pages,
            site_analysis=analysis,
            total_fetched=fetched_count,
            total_relevant=len(pages),
            duration_seconds=time.monotonic() - started,
        )

    async def _analyze_site(self, start_url: str, purpose: CrawlPurpose) -> SiteAnalysis:
        """Use agent to analyze the site structure.

        Args:
            start_url: Homepage to download and analyze.
            purpose: What the content is for; its value is embedded in the prompt.

        Returns:
            The analysis parsed from the LLM response, or the heuristic
            analysis when no LLM client is configured.
        """
        # Fetch homepage. This happens before the LLM check, so the page is
        # downloaded (and fetch errors propagate) on the heuristic path too.
        homepage_content = await self._fetch(start_url)

        if not self.llm:
            # Fallback: simple heuristics
            return self._simple_analysis(start_url, purpose)

        # `purpose.value` is used in both places: formatting the enum member
        # itself yields "CrawlPurpose.X" on Python 3.11+.
        prompt = f"""Analyze this website for crawling.

URL: {start_url}
Purpose: {purpose.value}

Analyze:
1. What type of site is this? (documentation, blog, wiki, etc.)
2. What are the main sections?
3. Which URLs should we prioritize for {purpose.value} content?
4. What should we skip?
5. What patterns in URLs matter?

Homepage content:
{homepage_content[:3000]}
"""
        response = await self.llm.complete(
            system_prompt="You are a web crawler expert. Analyze sites to determine what to crawl.",
            user_prompt=prompt,
        )

        # Parse response into SiteAnalysis
        return self._parse_analysis(start_url, response)

    async def _decide_urls(
        self,
        analysis: SiteAnalysis,
        purpose: CrawlPurpose,
    ) -> list[dict]:
        """Agent decides which URLs to fetch.

        Args:
            analysis: Site analysis providing ``suggested_urls`` and ``sections``.
            purpose: Accepted for interface symmetry; not used here.

        Returns:
            At most ``max_pages`` dicts of the form
            ``{"url": ..., "priority": ...}`` in the order of
            ``analysis.suggested_urls``. The priority is looked up in
            ``analysis.sections`` by URL and defaults to 0.5.
        """
        # NOTE(review): the result is not sorted by priority; confirm whether
        # callers expect priority order before adding a sort here.
        return [
            {"url": url, "priority": analysis.sections.get(url, 0.5)}
            for url in analysis.suggested_urls[: self.max_pages]
        ]

    async def _fetch_and_analyze(
        self,
        url: str,
        base_priority: float,
        purpose: CrawlPurpose,
    ) -> PageResult:
        """Fetch a page and analyze its relevance.

        Args:
            url: Page to download.
            base_priority: Priority assigned by ``_decide_urls``; currently
                not used in the scoring.
            purpose: What the content is for; passed to the relevance scorer.

        Returns:
            A PageResult for the page. ``depth`` is always 0 here.
        """
        html = await self._fetch(url)

        # Extract content
        title = self._extract_title(html)
        links = self._extract_links(html, url)

        # Extract main content (not HTML)
        main_content = self._extract_main_content(html)

        # Score relevance
        if self.llm:
            relevance = await self._score_relevance(main_content, purpose)
        else:
            relevance = self._simple_relevance(main_content, purpose)

        return PageResult(
            url=url,
            title=title,
            content=main_content,
            relevance_score=relevance,
            links=links,
            depth=0,
        )

    async def _fetch(self, url: str) -> str:
        """Fetch a URL and return the response body as text.

        The blocking ``requests`` call runs in a worker thread so that the
        event loop stays responsive while the request is in flight.

        Raises:
            requests.RequestException: On connection problems, timeouts, or
                (via ``raise_for_status``) an HTTP error status.
        """
        import requests

        headers = {"User-Agent": self.user_agent}
        response = await asyncio.to_thread(
            requests.get,
            url,
            headers=headers,
            timeout=self.request_timeout,
        )
        response.raise_for_status()
        return response.text

    # NOTE(review): the reviewed source is truncated inside `_extract_title`
    # (in the middle of a regex literal). The visible fragment is kept below
    # verbatim and has not been reconstructed:
    #
    #     def _extract_title(self, html: str) -> str:
    #         """Extract page title."""
    #         import re
    #         match = re.search(r'