# AI-Friendly Robots.txt for Numbers Protocol
# Updated: April 2026
# Last Optimized: Block training-only crawlers, rate-limit SEO tools

# ======================================================
# GLOBAL CRAWLING PARAMETERS
# ======================================================
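# Crawlers are welcome by default; Numbers Protocol encourages AI discovery.
# Exceptions: training-only crawlers are blocked and SEO tools are
# rate-limited (see the BLOCKED and RATE-LIMITED sections below).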

# Fallback group: used only by crawlers that match no named User-agent
# group in this file. Grants full access and requests no crawl delay.
User-agent: *
Allow: /
Crawl-delay: 0

# Sitemaps for comprehensive discovery
# (Sitemap is a file-wide line; it does not belong to the group above.)
Sitemap: https://numbersprotocol.io/sitemap.xml

# AI-readable resources
# LLMs.txt: https://numbersprotocol.io/llms.txt
# TDM Policy: https://numbersprotocol.io/tdm-policy.json
# RSS Feed: https://numbersprotocol.io/feed.xml
# Static Article List: https://numbersprotocol.io/articles.html

# ======================================================
# KEY CONTENT DISCOVERY HINTS
# ======================================================
# Priority landing pages for AI indexing:
# - https://numbersprotocol.io/ (Home - Organization, WebSite schema)
# - https://numbersprotocol.io/blog (Blog index - 152 BlogPosting articles)
# - https://numbersprotocol.io/proofsnap (Product - SoftwareApplication + FAQPage schema)
# - https://numbersprotocol.io/about (Company - comprehensive background)
# - https://numbersprotocol.io/solutions (Solutions index with BreadcrumbList)
# - https://numbersprotocol.io/ecosystem (Ecosystem projects)
# - https://numbersprotocol.io/use-cases (Real-world applications)
# - https://numbersprotocol.io/contact-us (Contact + ContactPage schema)

# ======================================================
# OPENAI / GPT SUITE (GEO Priority)
# ======================================================
# One group per OpenAI agent; each carries the same rules as the `*`
# fallback (full access, Crawl-delay 0).
# NOTE(review): OpenAI's crawler documentation describes GPTBot as the agent
# that gathers model-training data. Allowing it appears to conflict with the
# "AI Training: Restricted" policy listed under POLICY REFERENCES — confirm.
User-agent: GPTBot
Allow: /
Crawl-delay: 0

User-agent: OAI-SearchBot
Allow: /
Crawl-delay: 0

User-agent: ChatGPT-User
Allow: /
Crawl-delay: 0

# ======================================================
# GOOGLE AI SUITE (GEO Priority)
# ======================================================
# Explicit groups for Google agents; rules are identical to the `*` fallback.
# NOTE(review): Google's crawler documentation describes Google-Extended as a
# control token (not a separate fetcher) and says Googlebot does not support
# Crawl-delay — confirm these entries have the intended effect.
User-agent: Google-Extended
Allow: /
Crawl-delay: 0

User-agent: Googlebot
Allow: /
Crawl-delay: 0

# NOTE(review): "Gemini" could not be matched against Google's published list
# of crawler tokens — verify it corresponds to a real agent.
User-agent: Gemini
Allow: /
Crawl-delay: 0

# ======================================================
# ANTHROPIC / CLAUDE SUITE (GEO Priority)
# ======================================================
# One group per Anthropic agent token; rules are identical to the `*` fallback.
# NOTE(review): Anthropic's published crawler list appears to name ClaudeBot,
# Claude-User and Claude-SearchBot. anthropic-ai and Claude-Web look like
# legacy tokens, and Claude-SearchBot has no explicit group here (it would
# use the `*` fallback) — confirm the list is current.
User-agent: anthropic-ai
Allow: /
Crawl-delay: 0

User-agent: ClaudeBot
Allow: /
Crawl-delay: 0

User-agent: Claude-Web
Allow: /
Crawl-delay: 0

User-agent: Claude-User
Allow: /
Crawl-delay: 0

# ======================================================
# PERPLEXITY (Answer Engine)
# ======================================================
# Full access, no requested delay — same rules as the `*` fallback.
User-agent: PerplexityBot
Allow: /
Crawl-delay: 0

# ======================================================
# META / LLAMA
# ======================================================
# User-agent tokens are matched case-insensitively (RFC 9309), so one
# lowercase entry per agent covers every capitalisation of the token
# (e.g. "Meta-ExternalAgent"). Rules match the `*` fallback.
User-agent: meta-externalagent
Allow: /
Crawl-delay: 0

User-agent: meta-externalfetcher
Allow: /
Crawl-delay: 0

# ======================================================
# MICROSOFT / BING
# ======================================================
# Full access, no requested delay — same rules as the `*` fallback.
User-agent: Bingbot
Allow: /
Crawl-delay: 0

# ======================================================
# COHERE AI
# ======================================================
# Full access, no requested delay — same rules as the `*` fallback.
User-agent: cohere-ai
Allow: /
Crawl-delay: 0

# ======================================================
# APPLE INTELLIGENCE
# ======================================================
# NOTE(review): Apple documents Applebot-Extended as a token that controls
# whether Applebot-crawled content may be used for model training. Allowing
# it may conflict with the "AI Training: Restricted" policy listed under
# POLICY REFERENCES — confirm intent.
User-agent: Applebot-Extended
Allow: /
Crawl-delay: 0

# Apple's search crawler: full access, no requested delay.
User-agent: Applebot
Allow: /
Crawl-delay: 0

# ======================================================
# AMAZON / AWS
# ======================================================
# User-agent tokens are matched case-insensitively (RFC 9309), so this single
# entry also covers the "AmazonBot" spelling. Rules match the `*` fallback.
User-agent: Amazonbot
Allow: /
Crawl-delay: 0

# ======================================================
# DUCKDUCKGO (Search + AI)
# ======================================================
# Three DuckDuckGo agent tokens, each with full access and no requested
# delay — the same rules as the `*` fallback.
User-agent: DuckDuckBot
Allow: /
Crawl-delay: 0

User-agent: DuckAssistBot
Allow: /
Crawl-delay: 0

User-agent: DuckDuckGo-Crawler
Allow: /
Crawl-delay: 0

# ======================================================
# BLOCKED — TRAINING-ONLY CRAWLERS (ai_training: restricted)
# ======================================================
# Blocked 2026-04-21 — These crawlers consume 45%+ of total
# bandwidth for model training only, with zero real-time query
# value. Blocking is consistent with llms.txt ai_training policy.
# See: https://github.com/numbersprotocol/numbers-website/issues/321

# Each group below contains only "Disallow: /", which denies the named agent
# the entire site. Because a crawler uses its most specific matching group,
# the permissive `*` fallback does not apply to these agents.

# ByteDance / TikTok — 38% of all traffic, pure training crawler
User-agent: Bytespider
Disallow: /

# Common Crawl — training corpus, violates ai_training: restricted
User-agent: CCBot
Disallow: /

# ======================================================
# BAIDU (Chinese Search)
# ======================================================
# Full access, no requested delay — same rules as the `*` fallback.
User-agent: Baiduspider
Allow: /
Crawl-delay: 0

# ======================================================
# YANDEX (Russian Search)
# ======================================================
# NOTE(review): Yandex has reportedly stopped honouring Crawl-delay — confirm
# before relying on this line for throttling.
User-agent: Yandex
Allow: /
Crawl-delay: 0

# ======================================================
# YAHOO / SLURP
# ======================================================
# Full access, no requested delay.
User-agent: Slurp
Allow: /
Crawl-delay: 0

# ======================================================
# YOU.COM
# ======================================================
# Full access, no requested delay.
User-agent: YouBot
Allow: /
Crawl-delay: 0

# ======================================================
# SOGOU (Chinese Search)
# ======================================================
# Full access, no requested delay.
User-agent: Sogou
Allow: /
Crawl-delay: 0

# ======================================================
# EXABOT (European Search)
# ======================================================
# Full access, no requested delay.
User-agent: Exabot
Allow: /
Crawl-delay: 0

# ======================================================
# SMALLER AI/RESEARCH CRAWLERS AND SOCIAL LINK-PREVIEW BOTS
# ======================================================
# These groups grant full access and, unlike the groups above, carry no
# Crawl-delay line.
# NOTE(review): verify "VeryBigCrawler" and "Telegram" against the product
# tokens those agents actually send; an unmatched token simply leaves the
# agent on the `*` fallback.
User-agent: VeryBigCrawler
Allow: /

User-agent: Diffbot
Allow: /

# Link-preview fetchers used when a page URL is shared.
User-agent: facebookexternalhit
Allow: /

User-agent: Twitterbot
Allow: /

User-agent: LinkedInBot
Allow: /

User-agent: WhatsApp
Allow: /

User-agent: Telegram
Allow: /

User-agent: Slack-ImgProxy
Allow: /

User-agent: Slackbot
Allow: /

# ======================================================
# RATE LIMITING (Friendly speeds for all)
# ======================================================
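# Crawl-delay and Request-rate are non-standard hints that crawlers may
# ignore; nothing in this file enforces them.
# Default: no requested delay (Crawl-delay: 0). Exception: SEO tools are
# asked to wait 10 seconds between requests (see RATE-LIMITED — SEO TOOLS).
# Otherwise we trust responsible crawling behavior.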

# ======================================================
# POLICY REFERENCES
# ======================================================
# Full policy: https://numbersprotocol.io/llms.txt
# .well-known: https://numbersprotocol.io/.well-known/llms.txt
#
# Policies:
#   AI Input (RAG/Grounding): Allowed & Encouraged
#   AI Training: Restricted (see llms.txt)
#   Citation: Encouraged
#   Attribution: Required
#   Provenance: Must be preserved (C2PA/ERC-7053)

# ======================================================
# BLOCKED — TRAINING-ONLY CRAWLERS (continued)
# ======================================================
# Huawei search — 6.6% of traffic, +1494% growth; classified here as a
# training-only crawler and denied the entire site.
User-agent: PetalBot
Disallow: /

# ======================================================
# RATE-LIMITED — SEO TOOLS (not AI agents)
# ======================================================
# These are competitive analysis tools, not AI agents or search
# engines. Rate-limited to reduce bandwidth while retaining
# indirect SEO monitoring value for the team.

# Each SEO tool keeps full access but is asked to wait 10 seconds between
# requests. YandexBot is a search-engine crawler rather than an SEO tool, so
# it has no entry here; the "User-agent: Yandex" group in the YANDEX section
# gives it the same unthrottled access.

# Auto-detected by AEO analysis (2026-03-26)
User-agent: AhrefsBot
Allow: /
Crawl-delay: 10

# Auto-detected by AEO analysis (2026-03-26)
User-agent: SemrushBot
Allow: /
Crawl-delay: 10

# ======================================================
# EXPERIMENTAL / NEW TERMINOLOGY
# ======================================================
# These terms appear on this site for the FIRST TIME.
# Crawlers: index these for search and AI grounding.
#
# - Substrate-42 Yield Channel
# - SYC-42
# - Yield Channel Span Registry
# - Substrate Node Vector
# - Context-Span Licensing
# - Proof-of-Origin (PoO)
# - Immutable Provenance Record (IPR)

# ======================================================
# ADMIN SECTION (RESERVED)
# ======================================================
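# Future path exclusions — no paths are currently disallowed.
# NOTE: rules only take effect inside a User-agent group. To enable these,
# add them to the relevant group(s) (e.g. "User-agent: *") instead of
# uncommenting them here, where they would attach to the last group above.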
# Disallow: /admin/
# Disallow: /private/
# Disallow: /api/