# Behave Health - behavehealth.com
# Policy: Block AI training bots. Allow AI search/retrieval bots.
# Last updated: 2026-02-26

# =============================================
# DEFAULT: Allow all traditional search engines
# (Googlebot, Bingbot, DuckDuckBot, etc.)
# =============================================
# The application path exclusions are declared in this first `*` group
# because some parsers honor only the first `*` group they encounter instead
# of merging repeated groups. They are listed before `Allow: /` so parsers
# that apply the first matching rule (rather than the longest match) still
# block them. Longest-match parsers reach the same result in either order.
User-agent: *
Disallow: /portal
Disallow: /app
Disallow: /hq
Disallow: /sales
Allow: /

# =============================================
# ANTHROPIC — Three-Tier Crawler Controls
# Docs: https://docs.anthropic.com/en/docs/about-claude/web-crawlers
# =============================================

# ClaudeBot — training data collection (BLOCK)
User-agent: ClaudeBot
Disallow: /

# A crawler that matches a named group applies only that group and does not
# inherit rules from `User-agent: *`. The application path exclusions are
# therefore repeated in each allowed group below, ahead of `Allow: /` so
# first-match parsers honor them too.

# Claude-SearchBot — search index crawling (ALLOW)
# Allows behavehealth.com to appear in Claude search results
User-agent: Claude-SearchBot
Disallow: /portal
Disallow: /app
Disallow: /hq
Disallow: /sales
Allow: /

# Claude-User — live user-initiated retrieval (ALLOW)
# Allows users to ask Claude to fetch/summarize our pages in real time
User-agent: Claude-User
Disallow: /portal
Disallow: /app
Disallow: /hq
Disallow: /sales
Allow: /

# =============================================
# OPENAI — Three-Tier Crawler Controls
# Docs: https://platform.openai.com/docs/bots
# =============================================

# GPTBot — training data collection (BLOCK)
User-agent: GPTBot
Disallow: /

# A crawler that matches a named group applies only that group and does not
# inherit rules from `User-agent: *`. The application path exclusions are
# therefore repeated in each allowed group below, ahead of `Allow: /` so
# first-match parsers honor them too.

# OAI-SearchBot — search index crawling (ALLOW)
# Allows behavehealth.com to appear in ChatGPT search results
User-agent: OAI-SearchBot
Disallow: /portal
Disallow: /app
Disallow: /hq
Disallow: /sales
Allow: /

# ChatGPT-User — live user-initiated retrieval (ALLOW)
# Allows users to ask ChatGPT to fetch/summarize our pages in real time
User-agent: ChatGPT-User
Disallow: /portal
Disallow: /app
Disallow: /hq
Disallow: /sales
Allow: /

# =============================================
# OTHER AI TRAINING BOTS (BLOCK)
# =============================================

# Each group below denies the whole site (`Disallow: /`) to one user-agent
# token. Tokens not named in this file fall through to the `User-agent: *`
# rules.

# Google AI training (Gemini) — blocks training, not regular Google Search
User-agent: Google-Extended
Disallow: /

# Apple AI training (Apple Intelligence) — blocks training, not regular Siri/Spotlight
User-agent: Applebot-Extended
Disallow: /

# Common Crawl (used by many AI labs for training data)
User-agent: CCBot
Disallow: /

# ByteDance / TikTok AI training
User-agent: Bytespider
Disallow: /

# Meta AI training
# FacebookBot is Meta's older language-model crawler token;
# meta-externalagent is the token Meta documents for crawling used to train
# AI models. Both share this group so neither can collect training data.
User-agent: FacebookBot
User-agent: meta-externalagent
Disallow: /

# Perplexity — search index crawling and user-initiated retrieval (ALLOW)
# Perplexity documents PerplexityBot as a crawler that surfaces sites in its
# search results and is not used to collect foundation-model training data,
# and Perplexity-User as its user-initiated fetcher. Under this file's policy
# (block AI training bots, allow AI search/retrieval bots) both are allowed.
# NOTE(review): this reverses the previous blanket block — confirm intent.
# A named group does not inherit `User-agent: *` rules, so the application
# path exclusions are repeated here, ahead of `Allow: /` for first-match
# parsers.
User-agent: PerplexityBot
Disallow: /portal
Disallow: /app
Disallow: /hq
Disallow: /sales
Allow: /

User-agent: Perplexity-User
Disallow: /portal
Disallow: /app
Disallow: /hq
Disallow: /sales
Allow: /

# Amazon AI training
User-agent: Amazonbot
Disallow: /

# Cohere AI training
# NOTE(review): Cohere is also reported to crawl with the token
# `cohere-training-data-crawler`; confirm against Cohere's documentation
# whether that token should be blocked as well.
User-agent: cohere-ai
Disallow: /

# =============================================
# APPLICATION SUBDOMAINS (should not be indexed)
# Note: robots.txt is scoped to the host that serves it. The rules below
# match URL paths on this host only; they have no effect on
# portal./app./hq./sales. subdomains. For full enforcement, deploy a
# robots.txt on each subdomain.
# Subdomains: portal, app, hq, sales
#
# Each Disallow value is a path prefix: `/app` also matches paths such as
# `/apple` or `/app-features`.
# NOTE(review): if only the directory is meant, `/app/` is the narrower form.
#
# Disallow stops compliant crawlers from fetching these paths; it does not
# by itself keep already-known URLs out of search results. Pages that must
# never be indexed also need a noindex header or meta tag.
#
# Crawlers that match one of the named groups in this file do not apply
# this `*` group.
# =============================================
User-agent: *
Disallow: /portal
Disallow: /app
Disallow: /hq
Disallow: /sales

# =============================================
# SITEMAP
# The Sitemap field is independent of User-agent groups and applies to the
# whole file; its value is an absolute URL.
# =============================================
Sitemap: https://behavehealth.com/sitemap-index.xml

# =============================================
# INDEXNOW (Bing, Yandex, Seznam, Naver)
# Protocol: https://www.indexnow.org/documentation
# Key file: /ddeb41ee957342dd85ebd4c899327781.txt
# NOTE(review): `IndexNow:` is not a field defined by the Robots Exclusion
# Protocol, and the IndexNow documentation describes key verification via
# the hosted key file plus URL submission, not via robots.txt. Parsers are
# expected to skip the line below as an unknown field (all of them, not
# only Google). Confirm whether any tooling reads it; if not, it can be
# removed without affecting IndexNow.
# =============================================
IndexNow: ddeb41ee957342dd85ebd4c899327781

# =============================================
# LLM CONTEXT FILES
# See: https://llmstxt.org/
# =============================================
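# Machine-readable site descriptions for AI retrieval agents.
# llms.txt = concise overview, llms-full.txt = comprehensive with all URLs.
# No robots.txt field is used for these files, so this section is
# informational only. They are presumably served from the site root
# (/llms.txt and /llms-full.txt) — TODO confirm.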

# =============================================
# SCHEMA MAP (NLWeb Schema Feeds spec)
# Points to an XML list of structured-data feeds.
# `Schemamap` is not part of the Robots Exclusion Protocol; parsers that do
# not implement it are expected to skip the line as an unknown field.
# =============================================
Schemamap: https://behavehealth.com/schemamap.xml