# robots.txt for https://www.fietsonline.com (Magento 2 storefront)
#
# How crawlers read this file (RFC 9309):
#   - One directive per line. A group is one or more "User-agent" lines
#     followed by the rules that apply to those agents.
#   - A crawler obeys ONLY the most specific group that names it. A bot with
#     its own group does NOT also inherit the rules of "User-agent: *".
#   - Paths are case-sensitive. "*" matches any character sequence and "$"
#     anchors the end of the URL.
#   - Crawl-delay is a non-standard extension. Crawlers that support it treat
#     the value as seconds between requests; others (e.g. Googlebot) ignore it.

# ========================================
# SHARED RULES - all crawlers without a dedicated group below,
# plus the primary search engines for the Dutch market
# ========================================
# Googlebot, bingbot and DuckDuckBot are listed in this group on purpose:
# giving them a separate "Allow: /" group would exempt them from every
# Disallow rule in this section.
User-agent: *
User-agent: Googlebot
User-agent: bingbot
User-agent: DuckDuckBot
Allow: /
Crawl-delay: 1

# ----------------------------------------
# Magento 2 system directories & files
# ----------------------------------------
# Core Magento 2 directories
Disallow: /app/
Disallow: /bin/
Disallow: /dev/
Disallow: /lib/
Disallow: /phpserver/
Disallow: /pkginfo/
Disallow: /report/
Disallow: /setup/
Disallow: /update/
Disallow: /var/
Disallow: /vendor/
Disallow: /includes/
Disallow: /generated/

# Admin panel protection (IMPORTANT: never list the real custom admin URL here,
# robots.txt is public)
Disallow: /admin/
# Disallow: /[custom-admin-url]/

# Modern Magento 2 API endpoints
Disallow: /rest/
Disallow: /soap/
Allow: /graphql?
Disallow: /graphql/*?*&

# Magento functionalities and pages
Disallow: /index.php/
Disallow: /catalog/product_compare/
Disallow: /catalog/category/view/
Disallow: /catalog/product/view/
Disallow: /catalog/seo_sitemap/
Disallow: /catalogsearch/
Disallow: /checkout/
Disallow: /onestepcheckout/
Disallow: /control/
Disallow: /contacts/
Disallow: /customer/
Disallow: /customize/
Disallow: /newsletter/
Disallow: /review/
Disallow: /sendfriend/
Disallow: /wishlist/
Disallow: /poll/
Disallow: /downloadable/
Disallow: /sales/guest/form/
Disallow: /productalert/
Disallow: /subscription/
Disallow: /push_notification/

# Modern extensions and Ajax calls
Disallow: /ajaxcart/
Disallow: /ajax/
Disallow: /quickview/
Disallow: /mfproductsearch/
Disallow: /mfcmsdr/
Disallow: /pslogin/

# Media directories (sensitive content)
Disallow: /media/customer/
Disallow: /media/downloadable/
Disallow: /media/import/
Disallow: /media/theme_customization/
# NOTE(review): Magento 2 commonly serves resized storefront product images
# from this path; confirm that keeping them out of image search is intended.
Disallow: /media/catalog/product/cache/

# System and development files
Disallow: /composer.json
Disallow: /composer.lock
Disallow: /CONTRIBUTING.md
Disallow: /CONTRIBUTOR_LICENSE_AGREEMENT.html
Disallow: /COPYING.txt
Disallow: /Gruntfile.js
Disallow: /LICENSE.html
Disallow: /LICENSE.txt
Disallow: /LICENSE_AFL.txt
Disallow: /nginx.conf.sample
Disallow: /package.json
Disallow: /php.ini.sample
Disallow: /RELEASE_NOTES.txt
Disallow: /cron.php
Disallow: /cron.sh
Disallow: /error_log
Disallow: /install.php
Disallow: /STATUS.txt
Disallow: /get.php

# ----------------------------------------
# SEO critical: product lists & filters
# ----------------------------------------
# Product list sorting and filtering (prevents duplicate-content crawling)
Disallow: /*?*product_list_mode=
Disallow: /*?*product_list_order=
Disallow: /*?*product_list_limit=
Disallow: /*?*product_list_dir=

# Session ID and tracking parameters
# (these patterns match only when the parameter is the first one in the query string)
Disallow: /*?SID=
Disallow: /*?utm_source=
Disallow: /*?utm_medium=
Disallow: /*?utm_campaign=
Disallow: /*?utm_id=
Disallow: /*?utm_term=
Disallow: /*?utm_content=
Disallow: /*?gad_source=
Disallow: /*?gad_campaignid=
Disallow: /*?gclid=
Disallow: /*?_gl=
Disallow: /*?gbraid=
Disallow: /*?wbraid=
Disallow: /*?amp=
Disallow: /*?__gads=
Disallow: /*?__atuv=
Disallow: /*?filter=
Disallow: /*?channable=
Disallow: /*?fbclid=

# Allow plain pagination, but not pagination combined with further parameters
Allow: /*?page=
Allow: /*?p=
Disallow: /*?p=*&

# Layered navigation (multi-level filters)
Disallow: /*/-/*/*/*/*/
Disallow: /*/-/*-*-*-*

# Search results
Disallow: /catalogsearch/result/
Disallow: /search/

# File type based exclusions
Disallow: /*.php$
# Original capitalised patterns, kept for backward compatibility
Disallow: /*.CVS
Disallow: /*.Zip$
Disallow: /*.Svn$
Disallow: /*.Idea$
Disallow: /*.Sql$
Disallow: /*.Tgz$
# Lowercase variants: path matching is case-sensitive, so the capitalised
# patterns above do not match the usual lowercase names
Disallow: /*.cvs
Disallow: /*.zip$
Disallow: /*.svn$
Disallow: /*.idea$
Disallow: /*.sql$
Disallow: /*.tgz$

# ----------------------------------------
# Netherlands specific privacy pages
# ----------------------------------------
# GDPR/AVG specific pages (Dutch) - both folder and file versions
Disallow: /privacy/
Disallow: /privacy.html
Disallow: /cookiebeleid/
Disallow: /cookiebeleid.html
Disallow: /privacyverklaring/
Disallow: /privacyverklaring.html
Disallow: /algemene-voorwaarden/
Disallow: /algemene-voorwaarden.html
Disallow: /gebruiksvoorwaarden/
Disallow: /gebruiksvoorwaarden.html

# Other security directories
Disallow: /security/
Disallow: /backup/
Disallow: /logs/
Disallow: /tmp/

# Development and test environment
# (/dev/ is already listed under the core Magento 2 directories)
Disallow: /test/
Disallow: /staging/
Disallow: /daniel/

# ========================================
# AI BOTS MANAGEMENT (2024-2025) - SELECTIVE PERMISSION STRATEGY
# Allow but throttle, to keep visibility and citation opportunities
# ========================================
# NOTE(review): every bot with its own group below follows ONLY that group and
# therefore does not inherit the shared Disallow rules above. Copy the relevant
# Disallow lines into a group if that bot must also stay out of those paths.

# OpenAI (ChatGPT) bots - strategic allowance for citations & visibility
# Training data collection: allowed, but throttled to one request per 30 s
User-agent: GPTBot
Crawl-delay: 30
Allow: /

# User-initiated requests (high-quality traffic): fast access
User-agent: ChatGPT-User
Crawl-delay: 5
Allow: /

# Search functionality used for citations
User-agent: OAI-SearchBot
Crawl-delay: 10
Allow: /

# Anthropic (Claude) bots - allow for visibility but control aggression
# Bulk collection: very slow, but keeps the citation opportunity
User-agent: ClaudeBot
Crawl-delay: 30
Allow: /

# Blocked completely - considered too aggressive for training
User-agent: anthropic-ai
Disallow: /

# User-initiated browsing: fast access
User-agent: Claude-User
Crawl-delay: 5
Allow: /

# Google AI bots
# NOTE(review): Google documents Google-Extended as a control token rather than
# a separate crawler, and Google does not honour Crawl-delay, so the delay here
# is probably a no-op. Google's documentation also states that this token does
# not affect inclusion in Google Search - confirm the intent of this group.
User-agent: Google-Extended
Crawl-delay: 15
Allow: /

User-agent: Google-CloudVertexBot
Crawl-delay: 20

# Meta AI - allow but heavily throttle (aggressive, but relevant for social commerce)
User-agent: Meta-ExternalAgent
Crawl-delay: 25
Allow: /

# Social sharing is still important
User-agent: FacebookBot
Crawl-delay: 15
Allow: /

# Perplexity AI - good citation behaviour, moderate control
# Search indexing
User-agent: PerplexityBot
Crawl-delay: 15
Allow: /

# User queries get fast access
User-agent: Perplexity-User
Crawl-delay: 5
Allow: /

# Apple Intelligence - moderate approach
User-agent: Applebot-Extended
Crawl-delay: 20
Allow: /

# ----------------------------------------
# BLOCKED: aggressive/commercial crawlers that do not provide value back
# ----------------------------------------
# TikTok/ByteDance - very aggressive, limited citation value
User-agent: Bytespider
Disallow: /

# Common Crawl - used by many smaller AI companies without control
User-agent: CCBot
Disallow: /

# Commercial data scraper
User-agent: Diffbot
Disallow: /

# Research bot - no commercial value back
User-agent: AI2Bot
Disallow: /

# Training crawler - no citation value
User-agent: cohere-ai
Disallow: /

# Image scraping for AI training
User-agent: ImagesiftBot
Disallow: /

# ========================================
# SEO TOOLS & OTHER SEARCH ENGINES
# ========================================
# SEO tools with controlled crawl delay
User-agent: SemrushBot
Crawl-delay: 5

User-agent: AhrefsBot
Crawl-delay: 5

User-agent: MJ12bot
Crawl-delay: 10

# Other regional or aggressive crawlers
# Czech search engine, common in the EU
User-agent: SeznamBot
Crawl-delay: 10

# Russian search engine
User-agent: YandexBot
Crawl-delay: 10

# ========================================
# SITEMAPS
# ========================================
Sitemap: https://www.fietsonline.com/media/sitemap/sitemap_fo.xml

# security.txt is served from /.well-known/security.txt and is not a sitemap,
# so it must not be declared with a Sitemap directive; the placeholder below
# stays commented out.
# Sitemap: https://yourdomain.nl/.well-known/security.txt