# CMCC Foundation - Robots.txt
# Updated: December 2025
#
# Format notes for maintainers:
#   - One directive per line; "#" starts a comment that runs to end of line.
#   - A crawler obeys only the most specific User-agent group that matches it.
#     Rules in the "*" group are NOT inherited by the named groups below.
#   - Blank lines are used only between groups; inside a group, comment lines
#     are used as separators so older parsers do not split the group.
#   - Crawl-delay is a non-standard extension; some crawlers ignore it.

# Sitemap (Yoast SEO)
Sitemap: https://www.cmcc.it/sitemap_index.xml

# Default crawlers - Allow all public content
User-agent: *
Allow: /
#
# Block admin and internal pages
Disallow: /wp-admin/
Disallow: /wp-includes/
Disallow: /wp-content/plugins/
Disallow: /wp-content/themes/
Disallow: /wp-content/cache/
Disallow: /wp-content/uploads/wpo-plugins-tables-list.json
#
# Block search and filter parameters (avoid duplicate content)
Disallow: /*?s=
Disallow: /*&s=
Disallow: /*?p=
Disallow: /*&p=
Disallow: /search/
Disallow: /?s=
#
# Block publications filter parameters (avoid bot crawling of filter pages)
# These filters are handled client-side and generate expensive server queries
Disallow: /*?f_persone=
Disallow: /*&f_persone=
Disallow: /*?f_searchkey=
Disallow: /*&f_searchkey=
Disallow: /*?f_divisions=
Disallow: /*&f_divisions=
Disallow: /*?filter_topics=
Disallow: /*&filter_topics=
#
# Block AJAX and API endpoints from search engines
Disallow: /wp-json/
Disallow: /api/
Disallow: /wp-admin/admin-ajax.php
#
# Block feed URLs (avoid duplicate content)
Disallow: /feed/
Disallow: /*/feed/
Disallow: /comments/feed/
#
# Block WordPress login/register
Disallow: /wp-login.php
Disallow: /wp-signup.php
Disallow: /wp-register.php
#
# Block trackback
Disallow: /trackback/
Disallow: /*/trackback/
#
# Block xmlrpc
Disallow: /xmlrpc.php
#
# Allow important public pages explicitly
Allow: /sitemap_index.xml
Allow: /robots.txt
Allow: /publications-type/
Allow: /people/
Allow: /projects/
Allow: /article/
#
# Crawl delay for every bot without a dedicated group below
# (prevent server overload)
Crawl-delay: 10

# Special bot handling - Aggressive crawlers
# This named group replaces the "*" group for these bots, so the expensive
# search/filter URLs are repeated here next to the longer crawl delay.
User-agent: MegaIndex.ru/2.0
User-agent: MegaIndex.ru
Disallow: /*?s=
Disallow: /*&s=
Disallow: /search/
Disallow: /*?f_persone=
Disallow: /*&f_persone=
Disallow: /*?f_searchkey=
Disallow: /*&f_searchkey=
Disallow: /*?f_divisions=
Disallow: /*&f_divisions=
Disallow: /*?filter_topics=
Disallow: /*&filter_topics=
Crawl-delay: 60

# AhrefsBot - Block completely (commercial SEO scraper, heavy on publications)
User-agent: AhrefsBot
Disallow: /

# SemrushBot - Block completely (commercial SEO scraper)
User-agent: SemrushBot
Disallow: /

# DotBot - Block completely
User-agent: DotBot
Disallow: /

# MJ12bot (Majestic) - Block completely
User-agent: MJ12bot
Disallow: /

# # AI Training bots - Block completely (prevent content scraping)
# User-agent: CCBot
# User-agent: ChatGPT-User
# User-agent: GPTBot
# User-agent: Google-Extended
# User-agent: anthropic-ai
# User-agent: Claude-Web
# User-agent: Omgilibot
# User-agent: PerplexityBot
# User-agent: Applebot-Extended
# Disallow: /

# Cookie consent bot - Block publications to avoid unnecessary indexing
# NOTE(review): as a named group, this bot does not inherit the "*" rules;
# confirm that only /publications-type/ needs to be blocked for it.
User-agent: CookieYesbot
Disallow: /publications-type/

# BytePlus/ByteDance bots - Rate limit to prevent redirect loops
# This named group replaces the "*" group for these bots, so the expensive
# search/filter URLs are repeated here next to the crawl delay.
User-agent: ByteBot
User-agent: Bytespider
User-agent: ToutiaoSpider
Disallow: /*?s=
Disallow: /*&s=
Disallow: /search/
Disallow: /*?f_persone=
Disallow: /*&f_persone=
Disallow: /*?f_searchkey=
Disallow: /*&f_searchkey=
Disallow: /*?f_divisions=
Disallow: /*&f_divisions=
Disallow: /*?filter_topics=
Disallow: /*&filter_topics=
Crawl-delay: 30

# # Amazon Bot - Rate limit
# User-agent: Amazonbot
# Crawl-delay: 30