# Default group — applies to any crawler that has no more specific group in this file
# An empty Disallow value blocks nothing, so the entire site is crawlable by default
User-agent: *
Disallow:


# Training crawler — builds datasets for GPT model development
User-agent: GPTBot
Allow: /


# Search indexing crawler — powers ChatGPT Search citations
# WARNING: Blocking this keeps your pages out of ChatGPT Search answers
# (per OpenAI, blocked sites may still appear as plain navigational links)
User-agent: OAI-SearchBot
Allow: /


# User-triggered fetcher — activated when a human pastes a URL into ChatGPT
# Note: OpenAI states robots.txt rules may not apply to these fetches, because
# they are initiated by a user rather than by automatic crawling
User-agent: ChatGPT-User
Allow: /


# Training crawler — used for Claude model training data collection
User-agent: ClaudeBot
Allow: /


# Search indexing crawler — powers Claude's web search and citation features
# WARNING: Anthropic states blocking this "may reduce" visibility in Claude answers
User-agent: Claude-SearchBot
Allow: /


# User-triggered fetcher — activated when a Claude user requests a specific page
# Note: per Anthropic's crawler documentation, blocking this stops Claude from
# retrieving the page on a user's behalf (verify against the current docs)
User-agent: Claude-User
Allow: /


# Primary indexing crawler — builds Perplexity's AI search index
User-agent: PerplexityBot
Allow: /


# User-triggered retrieval — fetches a page on demand when answering a
# Perplexity user's question requires visiting it
# Note: Perplexity documents that this fetcher generally ignores robots.txt,
# because the request is made on a user's behalf
User-agent: Perplexity-User
Allow: /


# Control token, not a separate crawler — governs whether content fetched by
# Google's existing crawlers may be used for Gemini model training and for
# grounding AI answers (Gemini Apps, Vertex AI)
# Note: this token does not affect Google Search indexing or ranking.
# Google documents no "Googlebot-Extended" token, so no group is declared for
# it; grounding is already covered by Google-Extended.
User-agent: Google-Extended
Allow: /


# Meta crawler — per Meta's crawler documentation, used for AI model training
# and for indexing content to improve Meta products (verify against current docs)
User-agent: Meta-ExternalAgent
Allow: /


# Common Crawl crawler — collects pages for the open Common Crawl web corpus
# Common Crawl data underlies training for GPT, LLaMA, Mistral, and many others.
# Allowing CCBot is a broad opt-in for open AI research datasets: it is
# independent of the vendor-specific groups in this file, so content collected
# here can reach models whose own crawlers are handled separately.


User-agent: CCBot
Allow: /




# Sitemap index location — not tied to any User-agent group, so every crawler
# may use it; the value must be a fully qualified (absolute) URL
Sitemap: https://www.techmagnate.com/sitemap_index.xml