# TRUE.th robots.txt - AI Crawlers & GEO Optimization
# Version: 2.2 (April 2026)
# Purpose: Optimize for AI Overviews, ChatGPT, Perplexity, Claude, and other AI assistants


# SECTION 1: DEFAULT RULES FOR ALL CRAWLERS (Production Only)
# Fallback group: applies only to crawlers that match no named group in this
# file. A crawler that matches a named group obeys that group alone.
User-agent: *
# Private, transactional and transient areas
Disallow: /admin/
Disallow: /private/
Disallow: /cart/
Disallow: /checkout/
Disallow: /account/
Disallow: /user-profile/
Disallow: /api/
Disallow: /tmp/
Disallow: /cache/
# Prefix match: also covers /search?q=... and any path starting with /search
Disallow: /search
# Any URL ending in .pdf (wildcard syntax; needs a parser that supports * and $)
Disallow: /*.pdf$
# Allow is listed last on purpose: first-match parsers (for example Python's
# urllib.robotparser) stop at the first rule that matches, so a leading
# "Allow: /" would shadow every Disallow above. Longest-match parsers are
# unaffected by rule order.
Allow: /
# Non-standard directive (10 requests per minute); many major crawlers ignore it.
Request-rate: 10/1m

# SECTION 2: GOOGLE AI CRAWLERS (Google AI Overviews, SGE)
# A crawler obeys only the most specific group that names it; nothing is
# inherited from "User-agent: *". The default exclusions are therefore repeated
# here, and the three Google tokens share one rule set so they cannot drift apart.
# NOTE(review): confirm that blocking /api/ does not hide content that pages load
# client-side during rendering, and that PDFs should stay out of Google's index.
#
# Google-Extended: control token for use of content by Google's generative AI
# models (Gemini); it does not affect inclusion in Google Search.
User-agent: Google-Extended
# Googlebot: Google's main crawler (also indexes for AI Overviews)
User-agent: Googlebot
# Googlebot-Mobile: Google's mobile crawler
# NOTE(review): believed to be a legacy token; verify against Google's crawler list.
User-agent: Googlebot-Mobile
Disallow: /admin/
Disallow: /private/
Disallow: /cart/
Disallow: /checkout/
Disallow: /account/
Disallow: /user-profile/
Disallow: /api/
Disallow: /tmp/
Disallow: /cache/
Disallow: /search
Disallow: /*.pdf$
# Listed last so first-match parsers still honour the Disallow rules above
Allow: /

# SECTION 3: OPENAI CRAWLERS (ChatGPT Web Search, GPT-4)
# Named groups inherit nothing from "User-agent: *", so the default exclusions
# are repeated here. All OpenAI agents share one rule set.
#
# GPTBot: crawls content that may be used to train OpenAI models
User-agent: GPTBot
# OAI-SearchBot: indexes pages for ChatGPT search results
User-agent: OAI-SearchBot
# ChatGPT-User: fetches pages on demand when a ChatGPT user triggers browsing
User-agent: ChatGPT-User
Disallow: /admin/
Disallow: /private/
Disallow: /cart/
Disallow: /checkout/
Disallow: /account/
Disallow: /user-profile/
Disallow: /api/
Disallow: /tmp/
Disallow: /cache/
Disallow: /search
Disallow: /*.pdf$
# Listed last so first-match parsers still honour the Disallow rules above
Allow: /

# SECTION 4: ANTHROPIC CRAWLERS (Claude, Claude Web Search)
# Named groups inherit nothing from "User-agent: *", so the default exclusions
# are repeated here. Both Anthropic tokens share one rule set.
#
# ClaudeBot: Anthropic's web crawler
User-agent: ClaudeBot
# anthropic-ai: Anthropic AI training
# NOTE(review): believed to be a legacy token; verify against Anthropic's
# published crawler list and add any newer tokens that should be covered.
User-agent: anthropic-ai
Disallow: /admin/
Disallow: /private/
Disallow: /cart/
Disallow: /checkout/
Disallow: /account/
Disallow: /user-profile/
Disallow: /api/
Disallow: /tmp/
Disallow: /cache/
Disallow: /search
Disallow: /*.pdf$
# Listed last so first-match parsers still honour the Disallow rules above
Allow: /

# SECTION 5: PERPLEXITY CRAWLERS (Perplexity AI Search)
# Named groups inherit nothing from "User-agent: *", so the default exclusions
# are repeated here. Both tokens share one rule set.
#
# PerplexityBot: Perplexity AI search engine crawler
User-agent: PerplexityBot
# Perplexity-Web: Perplexity's secondary crawler
# NOTE(review): confirm this token against Perplexity's crawler documentation;
# the user-triggered agent may be published under a different name.
User-agent: Perplexity-Web
Disallow: /admin/
Disallow: /private/
Disallow: /cart/
Disallow: /checkout/
Disallow: /account/
Disallow: /user-profile/
Disallow: /api/
Disallow: /tmp/
Disallow: /cache/
Disallow: /search
Disallow: /*.pdf$
# Listed last so first-match parsers still honour the Disallow rules above
Allow: /

# SECTION 6: MICROSOFT CRAWLERS (Bing AI, Copilot, Microsoft Search)
# Named groups inherit nothing from "User-agent: *", so the default exclusions
# are repeated here. All Microsoft tokens share one rule set.
# NOTE(review): confirm that blocking /api/ does not hide content that pages
# load client-side during rendering.
#
# Bingbot: Bing's crawler (also indexes for Copilot)
User-agent: Bingbot
# BingPreview: generates page previews/snapshots for Bing results
User-agent: BingPreview
# MSNBot: older Microsoft search crawler
User-agent: MSNBot
Disallow: /admin/
Disallow: /private/
Disallow: /cart/
Disallow: /checkout/
Disallow: /account/
Disallow: /user-profile/
Disallow: /api/
Disallow: /tmp/
Disallow: /cache/
Disallow: /search
Disallow: /*.pdf$
# Listed last so first-match parsers still honour the Disallow rules above
Allow: /

# SECTION 7: ADDITIONAL AI CRAWLERS (Cohere, Together AI, Replicate, etc.)
# Named groups inherit nothing from "User-agent: *", so the default exclusions
# are repeated here. Every crawler in this section shares one rule set.
# NOTE(review): Together-AI, Replicate and Meta-AI could not be matched to a
# published crawler token; verify each against the vendor's documentation.
#
# Cohere AI models
User-agent: cohere-ai
# Together AI (distributed AI platform)
User-agent: Together-AI
# Replicate (AI model hosting)
User-agent: Replicate
# ByteSpider (ByteDance AI - TikTok/Douyin)
User-agent: Bytespider
# Meta AI / Facebook AI Research
User-agent: Meta-AI
# Baidu (Chinese search/AI)
User-agent: Baiduspider
# Yandex (Russian search)
User-agent: YandexBot
Disallow: /admin/
Disallow: /private/
Disallow: /cart/
Disallow: /checkout/
Disallow: /account/
Disallow: /user-profile/
Disallow: /api/
Disallow: /tmp/
Disallow: /cache/
Disallow: /search
Disallow: /*.pdf$
# Listed last so first-match parsers still honour the Disallow rules above
Allow: /

# SECTION 8: TRAINING DATA CRAWLERS (Block for privacy, optional)
# Common Crawl (training data): blocked from the entire site
User-agent: CCBot
Disallow: /

# Archive.org Wayback Machine
# Named groups inherit nothing from "User-agent: *", so the default exclusions
# are repeated here; otherwise /admin/, /account/ etc. would be open to archiving.
# NOTE(review): paths not listed below remain crawlable. If the intent is to
# archive ONLY /public/, add "Disallow: /" to this group.
User-agent: archive.org_bot
Disallow: /admin/
Disallow: /private/
Disallow: /cart/
Disallow: /checkout/
Disallow: /account/
Disallow: /user-profile/
Disallow: /api/
Disallow: /tmp/
Disallow: /cache/
Disallow: /search
Disallow: /*.pdf$
# Listed last so first-match parsers still honour the Disallow rules above
Allow: /public/

# SECTION 9: SOCIAL MEDIA & CONTENT CRAWLERS
# Link-preview fetchers for shared URLs. Named groups inherit nothing from
# "User-agent: *", so the default exclusions are repeated here. All preview
# bots share one rule set.
#
# Facebook's crawler
User-agent: facebookexternalhit
# Twitter/X crawler
User-agent: Twitterbot
# LinkedIn crawler
User-agent: LinkedInBot
# WhatsApp crawler
User-agent: WhatsApp
# Slack bot
User-agent: Slackbot
# Telegram crawler
User-agent: TelegramBot
# Discord crawler
User-agent: Discordbot
Disallow: /admin/
Disallow: /private/
Disallow: /cart/
Disallow: /checkout/
Disallow: /account/
Disallow: /user-profile/
Disallow: /api/
Disallow: /tmp/
Disallow: /cache/
Disallow: /search
Disallow: /*.pdf$
# Listed last so first-match parsers still honour the Disallow rules above
Allow: /

# SECTION 10: SITEMAPS
# Sitemap records are independent of any User-agent group and apply file-wide.
# Main sitemaps
Sitemap: https://www.true.th/sitemap.xml
Sitemap: https://www.true.th/blog-sitemap.xml
Sitemap: https://www.true.th/store-sitemap.xml
Sitemap: https://business.true.th/sitemap.xml
Sitemap: https://investor.true.th/sitemap.xml

# llms.txt for AI crawler guidance: https://www.true.th/llms.txt
# Deliberately NOT declared with "Sitemap:". llms.txt is a Markdown document,
# not an XML/text/feed sitemap, so sitemap consumers would reject it as invalid.
# It is referenced here for human readers only.

# IndexNow notification
# Sitemap: https://www.true.th/indexnow-notification

# SECTION 11: ADDITIONAL DIRECTIVES FOR AI OPTIMIZATION
# Allow/Disallow rules only take effect inside a User-agent group. Without the
# line below these rules would either be ignored or be attached to whichever
# group happens to precede them, because Sitemap records do not end a group.
# Parsers that follow RFC 9309 combine this group with the earlier
# "User-agent: *" group.
User-agent: *

# Explicitly allow home-page URLs that carry tracking parameters.
# Rules are prefix matches, so no trailing wildcard is needed.
Allow: /?utm_
Allow: /?fbclid=

# Allow JSON-LD schema files
Allow: /schema/*.json

# Allow robots.txt itself for verification
Allow: /robots.txt

# Allow well-known files
Allow: /.well-known/