"""Agent-Driven Web Crawler for Opus. Uses AI to analyze sites, decide what to crawl, and intelligently extract content. Instead of hardcoded patterns, the agent understands context and adapts. """ from dataclasses import dataclass, field from typing import Optional, list from enum import Enum import asyncio class CrawlPurpose(str, Enum): """What the user intends to do with the crawled content.""" DOCUMENTATION = "documentation" # Technical docs, guides TRAINING = "training" # Learning materials KNOWLEDGE = "knowledge" # General knowledge base RESEARCH = "research" # Research papers, articles REFERENCE = "reference" # Reference material @dataclass class PageResult: """A single crawled page.""" url: str title: str content: str relevance_score: float links: list[str] depth: int @dataclass class SiteAnalysis: """Agent's analysis of a site.""" site_type: str # documentation, blog, wiki, etc. sections: dict # section -> priority suggested_urls: list[str] skip_patterns: list[str] reasoning: str @dataclass class AgentCrawlResult: """Result from agent crawling.""" pages: list[PageResult] site_analysis: SiteAnalysis total_fetched: int total_relevant: int duration_seconds: float class AgentWebCrawler: """AI-powered web crawler that uses an agent to decide what to crawl. Instead of hardcoded patterns, the agent: 1. Analyzes the site structure 2. Decides what matters for the purpose 3. Adapts as it learns more 4. Knows when it has enough """ def __init__( self, llm_client=None, max_pages: int = 50, max_depth: int = 3, delay_seconds: float = 1.0, user_agent: str = "OpusCrawler/1.0", ): self.llm = llm_client self.max_pages = max_pages self.max_depth = max_depth self.delay = delay_seconds self.user_agent = user_agent self._fetched_urls = set() async def crawl( self, start_url: str, purpose: CrawlPurpose = CrawlPurpose.DOCUMENTATION, ) -> AgentCrawlResult: """Crawl a site using AI to decide what matters. Args: start_url: Where to begin purpose: What the content is for Returns: AgentCrawlResult with pages and analysis """ import time start_time = time.time() # Step 1: Analyze the site analysis = await self._analyze_site(start_url, purpose) # Step 2: Decide what to fetch (agent reasoning) urls_to_fetch = await self._decide_urls(analysis, purpose) # Step 3: Fetch in priority order with relevance scoring pages = [] fetched_count = 0 for url_info in urls_to_fetch: if fetched_count >= self.max_pages: break try: page = await self._fetch_and_analyze( url_info["url"], url_info["priority"], purpose, ) if page.relevance_score > 0.3: # Threshold pages.append(page) fetched_count += 1 # Be nice await asyncio.sleep(self.delay) except Exception as e: print(f"Failed to fetch {url_info['url']}: {e}") continue duration = time.time() - start_time return AgentCrawlResult( pages=pages, site_analysis=analysis, total_fetched=fetched_count, total_relevant=len(pages), duration_seconds=duration, ) async def _analyze_site(self, start_url: str, purpose: CrawlPurpose) -> SiteAnalysis: """Use agent to analyze the site structure.""" # Fetch homepage homepage_content = await self._fetch(start_url) if self.llm: # Use LLM to analyze prompt = f"""Analyze this website for crawling. URL: {start_url} Purpose: {purpose.value} Analyze: 1. What type of site is this? (documentation, blog, wiki, etc.) 2. What are the main sections? 3. Which URLs should we prioritize for {purpose} content? 4. What should we skip? 5. What patterns in URLs matter? Homepage content: {homepage_content[:3000]} """ response = await self.llm.complete( system_prompt="You are a web crawler expert. Analyze sites to determine what to crawl.", user_prompt=prompt, ) # Parse response into SiteAnalysis return self._parse_analysis(start_url, response) else: # Fallback: simple heuristics return self._simple_analysis(start_url, purpose) async def _decide_urls( self, analysis: SiteAnalysis, purpose: CrawlPurpose, ) -> list[dict]: """Agent decides which URLs to fetch.""" urls = [] # Start with suggested URLs for url in analysis.suggested_urls[:self.max_pages]: priority = analysis.sections.get(url, 0.5) urls.append({"url": url, "priority": priority}) return urls[:self.max_pages] async def _fetch_and_analyze( self, url: str, base_priority: float, purpose: CrawlPurpose, ) -> PageResult: """Fetch a page and analyze its relevance.""" content = await self._fetch(url) # Extract content title = self._extract_title(content) links = self._extract_links(content, url) # Extract main content (not HTML) main_content = self._extract_main_content(content) # Score relevance if self.llm: relevance = await self._score_relevance(main_content, purpose) else: relevance = self._simple_relevance(main_content, purpose) return PageResult( url=url, title=title, content=main_content, relevance_score=relevance, links=links, depth=0, ) async def _fetch(self, url: str) -> str: """Fetch a URL (using requests or similar).""" import requests headers = {"User-Agent": self.user_agent} response = requests.get(url, headers=headers, timeout=30) response.raise_for_status() return response.text def _extract_title(self, html: str) -> str: """Extract page title.""" import re match = re.search(r'