# robots.txt for https://readaloud.net
# Place at: https://readaloud.net/robots.txt (HTTP 200, Content-Type: text/plain)
#
# Policy: fully open. Search engines and AI crawlers are welcome everywhere.
# Dedupe of tracking-parameter URLs is handled by canonical tags, not here.

# ============================================================
# Default rule — all crawlers (search engines + unknown bots)
# ============================================================
# Fallback group: applies to any crawler that does not match a more
# specific User-agent group elsewhere in this file.
User-agent: *
Allow: /

# ============================================================
# AI / LLM crawlers — explicit welcome for clarity of intent
# ============================================================
# Every group below carries the same rule as the default group
# (Allow: /), so these entries document intent rather than change access.
# Maintenance note: a crawler that matches one of these named groups
# follows ONLY that group and ignores "User-agent: *" (RFC 9309, 2.2.1).
# Any Disallow later added to the default group must be repeated here.

# OpenAI
User-agent: GPTBot
Allow: /

User-agent: OAI-SearchBot
Allow: /

User-agent: ChatGPT-User
Allow: /

# Anthropic (Claude)
User-agent: ClaudeBot
Allow: /

User-agent: Claude-User
Allow: /

User-agent: Claude-SearchBot
Allow: /

# NOTE(review): anthropic-ai looks like a legacy token — confirm it is
# still honoured before relying on it.
User-agent: anthropic-ai
Allow: /

# Google (Gemini / Vertex AI — separate from Googlebot)
# NOTE(review): Google documents Google-Extended as a control token rather
# than a distinct fetching user agent, and GoogleOther as a general-purpose
# crawler — confirm against Google's current crawler list.
User-agent: Google-Extended
Allow: /

User-agent: GoogleOther
Allow: /

# Perplexity
User-agent: PerplexityBot
Allow: /

User-agent: Perplexity-User
Allow: /

# Apple Intelligence
User-agent: Applebot-Extended
Allow: /

# Meta AI
User-agent: Meta-ExternalAgent
Allow: /

User-agent: FacebookBot
Allow: /

# Amazon (Alexa / Rufus)
User-agent: Amazonbot
Allow: /

# ByteDance (Doubao / TikTok AI)
User-agent: Bytespider
Allow: /

# Cohere
User-agent: cohere-ai
Allow: /

# Mistral
User-agent: MistralAI-User
Allow: /

# DuckDuckGo AI
User-agent: DuckAssistBot
Allow: /

# You.com
User-agent: YouBot
Allow: /

# Common Crawl (dataset used by many LLMs)
User-agent: CCBot
Allow: /

# Diffbot (knowledge graph consumed by AI tools)
User-agent: Diffbot
Allow: /

# ============================================================
# Sitemaps and LLM index
# ============================================================
# Sitemap is a file-level directive: it is not tied to any User-agent
# group and must be an absolute URL. The LLM index files have no
# robots.txt directive of their own; they are described in the notes
# that follow.
Sitemap: https://readaloud.net/sitemap.xml

# Notes:
# - /llms.txt and /llms-full.txt are published at the domain root
#   so conformant LLM agents can discover them automatically.
# - Do NOT block /_next/ — rendering bots need CSS/JS from there.
# - Do NOT block ?utm_*, ?gclid, ?fbclid — dedupe via canonical tags.
#
# IMPORTANT — APP SUBDOMAIN
# This file only governs https://readaloud.net (robots.txt is per-host).
# The primary product lives at https://app.readaloud.net — ship an
# equivalent permissive robots.txt there as well:
#
#   https://app.readaloud.net/robots.txt
#     User-agent: *
#     Allow: /
#     (plus the same 22 AI-crawler Allow blocks as above)
#     Sitemap: https://app.readaloud.net/sitemap.xml
#
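# Standards-compliant crawlers treat a missing robots.txt (HTTP 4xx) as
# "allow all", but a 5xx or unreachable robots.txt as "disallow all"
# (RFC 9309), and some crawlers may still behave conservatively. Serving
# an explicit permissive file on the app subdomain removes that ambiguity.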