# robots.txt — Blackthorn
# https://blackthorn.io
# Updated: March 26, 2026
#
# AI & LLM guidance: /llms.txt
# Sitemap: https://blackthorn.io/sitemap_index.xml
#
# Policy:
# - Standard search engines: allowed with WP path restrictions
# - AI search assistants (retrieval/answering): allowed
# - AI training crawlers (dataset collection): disallowed
#
# Maintenance notes:
# - Keep ONE directive per line. Everything after a "#" on a line is a
#   comment, so directives must never share a line with a comment header.
# - A crawler that matches a named User-agent group obeys ONLY that group
#   and ignores the "*" group (RFC 9309). Any named group that should keep
#   the restrictions below has to repeat them.
# - Do not put blank lines inside a group; some parsers treat a blank line
#   as the end of the group. Use "#" lines as visual separators instead.

# ============================================================
# All crawlers — block WordPress internals and sensitive paths
# ============================================================
User-agent: *
Disallow: /wp-admin/
# admin-ajax.php is used by front-end features, so it stays reachable.
Allow: /wp-admin/admin-ajax.php
Disallow: /wp-login.php
Disallow: /wp-json/
Disallow: /xmlrpc.php
Disallow: /trackback/
Disallow: /feed/
Disallow: /?s=
Disallow: /search/
Disallow: /cgi-bin/
#
# Sensitive wp-content directories
Disallow: /wp-content/debug.log
Disallow: /wp-content/upgrade/
Disallow: /wp-content/updraft/
Disallow: /wp-content/ai1wm-backups/
Disallow: /wp-content/litespeed/
# NOTE(review): LiteSpeed Cache normally serves its optimized CSS/JS from
# these subfolders. Blocking them can stop search engines from rendering
# pages, so they are re-allowed (longest match wins). Confirm the folder
# names against the live site.
Allow: /wp-content/litespeed/css/
Allow: /wp-content/litespeed/js/
Allow: /wp-content/litespeed/ucss/
#
# Nelio A/B testing experiment previews
Disallow: /?nab=
#
# Author pages — username enumeration risk + duplicate content
Disallow: /author/
Disallow: /?author=
#
# Comment reply URLs — duplicate content / crawl budget.
# Reply links are appended to post permalinks (/post-slug/?replytocom=123),
# so the rule needs a wildcard; "/?replytocom=" alone only matches the
# site root.
Disallow: /*?replytocom=
#
# Crawl budget — staging/utility paths
Disallow: /local-xdebuginfo.php
Disallow: /readme.html
Disallow: /license.txt
Disallow: /wp-activate.php
Disallow: /wp-signup.php

Sitemap: https://blackthorn.io/sitemap_index.xml

# ============================================================
# Conventional Search Engines (governed by * block above)
# Googlebot, DuckDuckBot, Slurp, YandexBot are allowed to
# crawl all public content; WP path restrictions above apply.
# Note: Google AI Overviews uses Googlebot (not Google-Extended),
# so content is eligible for AI Overviews via the * block.
# ============================================================

# ============================================================
# SEO Tools (analytics + auditing — ALLOWED)
# ============================================================
# Semrush — SEO crawling and site audits.
# A named group replaces the "*" group for the bots it names (RFC 9309),
# so the WordPress path restrictions are repeated here; a bare "Allow: /"
# would open /wp-admin/, backups, etc. to these crawlers.
# Keep this list in sync with the "User-agent: *" group.
User-agent: SemrushBot
User-agent: SemrushBot-SA
User-agent: SemrushBot-SI
Allow: /
Crawl-delay: 1
Disallow: /wp-admin/
Allow: /wp-admin/admin-ajax.php
Disallow: /wp-login.php
Disallow: /wp-json/
Disallow: /xmlrpc.php
Disallow: /trackback/
Disallow: /feed/
Disallow: /?s=
Disallow: /search/
Disallow: /cgi-bin/
Disallow: /wp-content/debug.log
Disallow: /wp-content/upgrade/
Disallow: /wp-content/updraft/
Disallow: /wp-content/ai1wm-backups/
Disallow: /wp-content/litespeed/
# NOTE(review): optimized CSS/JS folders stay crawlable — confirm paths.
Allow: /wp-content/litespeed/css/
Allow: /wp-content/litespeed/js/
Allow: /wp-content/litespeed/ucss/
Disallow: /?nab=
Disallow: /author/
Disallow: /?author=
# Reply links hang off post permalinks, hence the wildcard.
Disallow: /*?replytocom=
Disallow: /local-xdebuginfo.php
Disallow: /readme.html
Disallow: /license.txt
Disallow: /wp-activate.php
Disallow: /wp-signup.php

# ============================================================
# AI Search Assistants (retrieval + answering — ALLOWED)
# These bots surface content in AI-powered search results.
# ============================================================
# One shared group: public content is allowed, WordPress internals and
# sensitive paths stay blocked (same list as the "User-agent: *" group —
# keep in sync). No blank lines between the User-agent lines, so every
# parser reads them as a single group.
#
# OpenAI — user-initiated browsing/retrieval
User-agent: ChatGPT-User
# OpenAI — search indexing for ChatGPT web search (distinct from GPTBot training)
User-agent: OAI-SearchBot
User-agent: PerplexityBot
User-agent: YouBot
# Apple — Siri / Spotlight search crawler
User-agent: Applebot
# Bing powers Microsoft Copilot web grounding
User-agent: bingbot
# xAI — Grok AI search assistant
User-agent: Grok
# DuckDuckGo — AI assistant
User-agent: DuckAssistBot
Allow: /
Disallow: /wp-admin/
Allow: /wp-admin/admin-ajax.php
Disallow: /wp-login.php
Disallow: /wp-json/
Disallow: /xmlrpc.php
Disallow: /trackback/
Disallow: /feed/
Disallow: /?s=
Disallow: /search/
Disallow: /cgi-bin/
Disallow: /wp-content/debug.log
Disallow: /wp-content/upgrade/
Disallow: /wp-content/updraft/
Disallow: /wp-content/ai1wm-backups/
Disallow: /wp-content/litespeed/
# NOTE(review): optimized CSS/JS folders stay crawlable — confirm paths.
Allow: /wp-content/litespeed/css/
Allow: /wp-content/litespeed/js/
Allow: /wp-content/litespeed/ucss/
Disallow: /?nab=
Disallow: /author/
Disallow: /?author=
# Reply links hang off post permalinks, hence the wildcard.
Disallow: /*?replytocom=
Disallow: /local-xdebuginfo.php
Disallow: /readme.html
Disallow: /license.txt
Disallow: /wp-activate.php
Disallow: /wp-signup.php

# Apple — Applebot-Extended is not a crawler or a search assistant. It is
# the opt-out token that controls whether content fetched by Applebot may
# be used to train Apple's foundation models. Per the training policy in
# this file it is disallowed; Applebot (search) above is unaffected.
User-agent: Applebot-Extended
Disallow: /

# ============================================================
# AI Training Crawlers (dataset collection — DISALLOWED)
# These bots harvest content to train foundation models.
# ============================================================
# One directive per line; each vendor gets its own group. User-agent
# lines that share a rule are stacked with no blank line between them.

# OpenAI — training (ChatGPT-User handles browsing/retrieval)
User-agent: GPTBot
Disallow: /

# Google — Gemini Apps / Vertex AI training and grounding.
# Does not affect Google Search or AI Overviews, which use Googlebot.
User-agent: Google-Extended
Disallow: /

# Common Crawl — primary dataset source for many LLMs
User-agent: CCBot
Disallow: /

# Anthropic — ClaudeBot is the current training crawler;
# anthropic-ai and Claude-Web are legacy tokens kept for older agents.
User-agent: ClaudeBot
User-agent: anthropic-ai
User-agent: Claude-Web
Disallow: /

# Cohere
User-agent: cohere-ai
Disallow: /

# Diffbot — knowledge graph / AI data extraction
User-agent: Diffbot
Disallow: /

# ByteDance / TikTok
User-agent: Bytespider
Disallow: /

# Meta AI training
# NOTE(review): Meta-ExternalFetcher performs user-initiated fetches
# (comparable to ChatGPT-User) rather than dataset collection — confirm
# that blocking it is intended under the "AI search assistants: allowed"
# policy.
User-agent: Meta-ExternalAgent
User-agent: Meta-ExternalFetcher
Disallow: /

# Amazon Alexa AI
User-agent: Amazonbot
Disallow: /

# Omgili / Webz.io — media monitoring datasets
User-agent: omgili
User-agent: omgilibot
Disallow: /

# Timpi — decentralized web index for AI
User-agent: Timpibot
Disallow: /

# ImageSift — visual AI training
User-agent: ImagesiftBot
Disallow: /

# DataForSEO — bulk data reseller / AI datasets
User-agent: DataForSeoBot
Disallow: /

# PetalBot — Huawei/Petal Search AI training
User-agent: PetalBot
Disallow: /