# robots.txt - AI Bot & Crawler Management
# Strategy: Allow search engines and AI search (brings human traffic),
#           throttle AI training crawlers
#
# Format notes:
#   - One directive per line; everything after '#' on a line is a comment.
#   - Comments are kept on their own lines so that parsers which do not strip
#     trailing comments cannot fold them into a user-agent or delay value.
#   - A crawler obeys only the most specific group that matches it. Groups
#     below that name a bot do NOT inherit the Disallow rules of the
#     "User-agent: *" group at the bottom of this file.
#   - Crawl-delay is a non-standard extension; not every crawler honors it.

# =============================================================================
# SEARCH ENGINES - Full Access (Bring Human SEO Traffic)
# =============================================================================

User-agent: Googlebot
Allow: /
# Crawl-delay: 0.1
# Block IPv6 wildcard search URLs — thin content (near-identical template pages)
# Disallow: /ip/search/
# Disallow: /ip/ip-to-location/
# Disallow: /seo/keyword-tool/

User-agent: Bingbot
Allow: /
Crawl-delay: 0.5

User-agent: Baiduspider
Allow: /
Crawl-delay: 2

# =============================================================================
# AI SEARCH ENGINES - Allowed with Moderate Delay (Bring Human Referrals)
# These can drive traffic when users ask questions and get linked to your site
# =============================================================================

# Apple AI Search / Siri
User-agent: Applebot
Allow: /
Crawl-delay: 3

# Perplexity AI Search
User-agent: PerplexityBot
Allow: /
Crawl-delay: 3

# OpenAI SearchGPT
User-agent: OAI-SearchBot
Allow: /
Crawl-delay: 3

# DuckDuckGo AI Assistant
User-agent: DuckAssistBot
Allow: /
Crawl-delay: 3

# =============================================================================
# AI ASSISTANTS - Allowed with Delay (Direct User Queries)
# These are users asking questions through ChatGPT, Claude, etc.
# =============================================================================

# ChatGPT web browsing
User-agent: ChatGPT-User
Allow: /
Crawl-delay: 2

# Perplexity user queries
User-agent: Perplexity-User
Allow: /
Crawl-delay: 2

# Claude user queries
User-agent: Claude-User
Allow: /
Crawl-delay: 2

# Meta AI user queries
User-agent: Meta-ExternalFetcher
Allow: /
Crawl-delay: 2

# Mistral user queries
User-agent: MistralAI-User
Allow: /
Crawl-delay: 2

# =============================================================================
# SEO SCRAPERS - HEAVILY THROTTLED (No Human Traffic Value)
# These bots scrape for SEO/analytics tools - no direct visitor referrals
# =============================================================================

# Semrush SEO tool crawler
# Very slow - purely for their commercial SEO tool
User-agent: SemrushBot
Crawl-delay: 60

# Moz/OpenSiteExplorer
# Very slow - SEO link analysis only
User-agent: DotBot
Crawl-delay: 60

# Ahrefs SEO tool
# Very slow - backlink analysis tool
User-agent: AhrefsBot
Crawl-delay: 60

# Majestic SEO crawler
# Very slow - link intelligence
User-agent: MJ12bot
Crawl-delay: 60

# DataForSEO API crawler
User-agent: DataForSeoBot
Crawl-delay: 60

# WebMeUp backlink crawler
User-agent: BLEXBot
Crawl-delay: 60

# Generic SEO scanner
User-agent: seoscanners
Crawl-delay: 60

# SEMrush Site Audit
User-agent: SiteAuditBot
Crawl-delay: 60

# SEOkicks link research
User-agent: SEOkicks
Crawl-delay: 60

# =============================================================================
# AI TRAINING CRAWLERS - BLOCKED or HEAVILY THROTTLED (No Human Traffic Value)
# These only consume resources for AI model training without bringing users
# Exception: entries marked "monetized via Tollbit" are explicitly allowed.
# =============================================================================

# ByteDance/TikTok - 540k requests!
# Heavy throttle instead of block (may still index)
User-agent: Bytespider
Crawl-delay: 30

# OpenAI training crawler
# Allowed — monetized via Tollbit
User-agent: GPTBot
Allow: /
Crawl-delay: 10

# Amazon AI training
User-agent: Amazonbot
Crawl-delay: 20

# Huawei AI training
User-agent: PetalBot
Crawl-delay: 20

# Anthropic training crawler
# Allowed — monetized via Tollbit
User-agent: ClaudeBot
Allow: /
Crawl-delay: 10

# Anthropic search training
# Allowed — monetized via Tollbit
User-agent: Claude-SearchBot
Allow: /
Crawl-delay: 10

# Common Crawl (used by many AI companies)
# Allowed — monetized via Tollbit
User-agent: CCBot
Allow: /
Crawl-delay: 15

# Meta training crawler
User-agent: FacebookBot
Crawl-delay: 20

# Meta general crawler
User-agent: Meta-ExternalAgent
Crawl-delay: 15

# Google AI training
# Allowed — monetized via Tollbit
User-agent: Google-CloudVertexBot
Allow: /
Crawl-delay: 10

# Google AI training
# Allowed — monetized via Tollbit
User-agent: Google-Extended
Allow: /
Crawl-delay: 10

# Anthropic variants
# Allowed — monetized via Tollbit
User-agent: anthropic-ai
Allow: /
Crawl-delay: 10

# Cohere AI
# Allowed — monetized via Tollbit
User-agent: cohere-ai
Allow: /
Crawl-delay: 10

# Diffbot crawler
User-agent: Diffbot
Crawl-delay: 20

# Image training
User-agent: ImagesiftBot
Disallow: /

# Omgili crawler
User-agent: Omgilibot
Disallow: /

# Omgili variant
User-agent: Omgili
Disallow: /

# You.com crawler
User-agent: YouBot
Crawl-delay: 10

# =============================================================================
# ARCHIVAL & RESEARCH - Allowed with Delay
# =============================================================================

# Internet Archive - Wayback Machine
User-agent: archive.org_bot
Allow: /
Crawl-delay: 5

# Internet Archive variant
User-agent: ia_archiver
Allow: /
Crawl-delay: 5

# =============================================================================
# MISCELLANEOUS AI CRAWLERS - BLOCKED (Timpibot is throttled, not blocked)
# =============================================================================

# Anchor AI
# NOTE(review): this user-agent value contains a space, which is outside the
# RFC 9309 product-token grammar (letters, "-" and "_" only); some parsers may
# match only the first word. Confirm the token this bot actually sends.
User-agent: Anchor Browser
Disallow: /

# Timpi search
User-agent: Timpibot
Crawl-delay: 15

# ProRata.ai
User-agent: ProRataInc
Disallow: /

# Novellum
# NOTE(review): this user-agent value contains spaces (see the note on
# "Anchor Browser" above); confirm the token this bot actually sends.
User-agent: Novellum AI Crawl
Disallow: /

# =============================================================================
# DEFAULT RULE - Moderate Crawl Delay for Unlisted Bots
# =============================================================================
# NOTE(review): these Disallow rules apply only to bots with no named group
# above. If build/static paths should be hidden from the named bots as well,
# the rules must be repeated inside those groups.

User-agent: *
Crawl-delay: 2
Disallow: /dist/
Disallow: /wwwroot/
Disallow: /*.js$
Disallow: /*.css$
Disallow: /bin/
Disallow: /obj/
# Disallow: /ip/search/
# Disallow: /ip/ip-to-location/
# Disallow: /seo/keyword-tool/

# =============================================================================
# SITEMAP - Help Search Engines Index Your Content
# =============================================================================

Sitemap: https://tejji.com/sitemap.xml