# CMCC Foundation - Robots.txt
# Updated: December 2025

# Sitemap (Yoast SEO)
Sitemap: https://www.cmcc.it/sitemap_index.xml

# Default crawlers - everything is crawlable unless a Disallow below matches.
# There is deliberately no "Allow: /": URLs are allowed by default, and a
# leading "Allow: /" cancels every Disallow for parsers that stop at the first
# matching rule.
User-agent: *

# Block admin and internal pages
# NOTE(review): /wp-includes/, plugins and themes also contain CSS/JS assets;
# confirm that blocking them does not hurt how search engines render pages.
Disallow: /wp-admin/
Disallow: /wp-includes/
Disallow: /wp-content/plugins/
Disallow: /wp-content/themes/
Disallow: /wp-content/cache/
Disallow: /wp-content/uploads/wpo-plugins-tables-list.json

# Block search and filter parameters (avoid duplicate content)
Disallow: /*?s=
Disallow: /*&s=
Disallow: /*?p=
Disallow: /*&p=
Disallow: /search/
Disallow: /?s=

# Block publications filter parameters (avoid bot crawling of filter pages)
# These filters are handled client-side and generate expensive server queries
Disallow: /*?f_persone=
Disallow: /*&f_persone=
Disallow: /*?f_searchkey=
Disallow: /*&f_searchkey=
Disallow: /*?f_divisions=
Disallow: /*&f_divisions=
Disallow: /*?filter_topics=
Disallow: /*&filter_topics=

# Block AJAX and API endpoints from search engines
Disallow: /wp-json/
Disallow: /api/
Disallow: /wp-admin/admin-ajax.php

# Block feed URLs (avoid duplicate content)
Disallow: /feed/
Disallow: /*/feed/
Disallow: /comments/feed/

# Block WordPress login/register
Disallow: /wp-login.php
Disallow: /wp-signup.php
Disallow: /wp-register.php

# Block trackback
Disallow: /trackback/
Disallow: /*/trackback/

# Block xmlrpc
Disallow: /xmlrpc.php

# Explicit allows for crawler entry points.
# Do NOT add "Allow: /publications-type/", "/people/", "/projects/" or
# "/article/" here: those sections are already crawlable by default, and when
# rules conflict the longest matching path wins. "Allow: /publications-type/"
# is longer than "Disallow: /*?f_persone=" (and the other parameter rules), so
# it would re-open exactly the filter, search and feed URLs blocked above.
Allow: /sitemap_index.xml
Allow: /robots.txt

# Crawl delay in seconds for crawlers that use this group (prevent server
# overload). Crawl-delay is a non-standard directive; not every crawler honors it.
Crawl-delay: 10

# Special bot handling - Aggressive crawlers
# A crawler that matches a named group obeys only that group and ignores the
# "User-agent: *" group, so the Disallow rules are repeated here. Without them
# this group would only slow the bot down while opening every blocked URL to it.
# Keep this list in sync with the "User-agent: *" group.
User-agent: MegaIndex.ru/2.0
User-agent: MegaIndex.ru
# Admin and internal pages
Disallow: /wp-admin/
Disallow: /wp-includes/
Disallow: /wp-content/plugins/
Disallow: /wp-content/themes/
Disallow: /wp-content/cache/
Disallow: /wp-content/uploads/wpo-plugins-tables-list.json
# Search and publications filter parameters (expensive server queries)
Disallow: /*?s=
Disallow: /*&s=
Disallow: /*?p=
Disallow: /*&p=
Disallow: /search/
Disallow: /*?f_persone=
Disallow: /*&f_persone=
Disallow: /*?f_searchkey=
Disallow: /*&f_searchkey=
Disallow: /*?f_divisions=
Disallow: /*&f_divisions=
Disallow: /*?filter_topics=
Disallow: /*&filter_topics=
# API, feeds, login, trackback, xmlrpc
Disallow: /wp-json/
Disallow: /api/
Disallow: /feed/
Disallow: /*/feed/
Disallow: /wp-login.php
Disallow: /wp-signup.php
Disallow: /wp-register.php
Disallow: /trackback/
Disallow: /*/trackback/
Disallow: /xmlrpc.php
# Slower crawl rate than the default group (seconds between requests)
Crawl-delay: 60

# Crawlers blocked from the entire site.
# One group with several User-agent lines applies the same rule to each of them.
#   AhrefsBot  - commercial SEO scraper, heavy on publications
#   SemrushBot - commercial SEO scraper
#   DotBot
#   MJ12bot    - Majestic
User-agent: AhrefsBot
User-agent: SemrushBot
User-agent: DotBot
User-agent: MJ12bot
Disallow: /

# DISABLED: AI training bots - uncomment the group below to block them completely (prevent content scraping)
# User-agent: CCBot
# User-agent: ChatGPT-User
# User-agent: GPTBot
# User-agent: Google-Extended
# User-agent: anthropic-ai
# User-agent: Claude-Web
# User-agent: Omgilibot
# User-agent: PerplexityBot
# User-agent: Applebot-Extended
# Disallow: /

# Cookie consent bot - Block publications to avoid unnecessary crawling of that section.
# NOTE(review): a bot that matches this named group follows only this group,
# not the "User-agent: *" group, so this is the only rule applied to it -
# confirm that is intended.
User-agent: CookieYesbot
Disallow: /publications-type/

# BytePlus/ByteDance bots - Rate limit to prevent redirect loops
# A crawler that matches a named group obeys only that group and ignores the
# "User-agent: *" group, so the Disallow rules are repeated here. Without them
# these bots would be rate limited but free to crawl every blocked URL,
# including the expensive search and filter pages.
# Keep this list in sync with the "User-agent: *" group.
User-agent: ByteBot
User-agent: Bytespider
User-agent: ToutiaoSpider
# Admin and internal pages
Disallow: /wp-admin/
Disallow: /wp-includes/
Disallow: /wp-content/plugins/
Disallow: /wp-content/themes/
Disallow: /wp-content/cache/
Disallow: /wp-content/uploads/wpo-plugins-tables-list.json
# Search and publications filter parameters (expensive server queries)
Disallow: /*?s=
Disallow: /*&s=
Disallow: /*?p=
Disallow: /*&p=
Disallow: /search/
Disallow: /*?f_persone=
Disallow: /*&f_persone=
Disallow: /*?f_searchkey=
Disallow: /*&f_searchkey=
Disallow: /*?f_divisions=
Disallow: /*&f_divisions=
Disallow: /*?filter_topics=
Disallow: /*&filter_topics=
# API, feeds, login, trackback, xmlrpc
Disallow: /wp-json/
Disallow: /api/
Disallow: /feed/
Disallow: /*/feed/
Disallow: /wp-login.php
Disallow: /wp-signup.php
Disallow: /wp-register.php
Disallow: /trackback/
Disallow: /*/trackback/
Disallow: /xmlrpc.php
# Seconds between requests
Crawl-delay: 30

# DISABLED: Amazon bot - uncomment the group below to rate limit it
# User-agent: Amazonbot
# Crawl-delay: 30