# ================================================================
# robots.txt — www.sensodays.ro
# ================================================================
# NOTE: Crawl-delay is honored by well-behaved bots only.
# For time-based crawling restrictions (e.g. crawl only at night):
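#   - Googlebot: crawl rate is managed automatically (the Search Console
#     crawl-rate setting has been retired); to slow it down temporarily,
#     answer with HTTP 429/503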
#   - Cloudflare Rate Limiting rules (filter by User-Agent + hour)
#   - Nginx/Apache access rules per User-Agent
# ================================================================


# ----------------------------------------------------------------
# GOOGLEBOT — highest priority, fast crawl allowed
# Googlebot does not support Crawl-delay; the directive is ignored.
# Google tunes crawl rate automatically; to reduce it temporarily,
# return HTTP 429/503 responses to Googlebot.
# ----------------------------------------------------------------
# Rules for Google's main web crawler: everything is crawlable except
# the account, checkout, search and duplicate-content paths listed below.
# Crawl-delay is not supported by Googlebot (it is ignored); the line is
# kept only for parity with the other groups in this file.
User-agent: Googlebot
Crawl-delay: 1
Allow: /
Disallow: /admin/
Disallow: /customer/
Disallow: /checkout/
Disallow: /sales/
Disallow: /sendfriend/
Disallow: /wishlist/
Disallow: /catalog/product_compare/
Disallow: /catalogsearch/
Disallow: /*?*q=
Disallow: /*?*SID=
# Page-size parameter. Written as "/*?*limit=" (not "/*?limit=") so it
# also matches when the parameter is not the first one in the query
# string, and covers any parameter name ending in "limit=".
Disallow: /*?*limit=
Disallow: /wpproductlabels/
# Filter/faceted navigation pages (e.g. /chiuvete-de-inox/filtru/box.html)
# These pages carry meta noindex,nofollow but are also blocked here
# to save crawl budget. NOTE: blocking in robots.txt means Googlebot
# cannot read the noindex tag either — the net result is the same,
# but keep this in sync if the noindex policy ever changes.
# Rule paths must begin with "/", so the former "*/filtru/" pattern is
# split into two compliant rules with the same coverage: a root-level
# /filtru/ and a /filtru/ segment below any other path.
Disallow: /filtru/
Disallow: /*/filtru/
# Sorting and direction parameters — duplicate content, no SEO value.
# Same products as the base category, just reordered. Meta noindex
# is set on these pages but blocking here also saves crawl budget.
# Pagination (?p=2, ?p=3) is intentionally NOT blocked — Googlebot
# needs to follow links on those pages to discover all products.
Disallow: /*?*product_list_order=
Disallow: /*?*product_list_dir=

# Googlebot-Image: restricted to product images only.
# Allowing /media/catalog/product/ helps Google index product
# photos for Google Shopping and Image Search results.
# Everything outside /media/catalog/product/ is blocked for this user
# agent by the catch-all "Disallow: /" that follows the Allow rule.
User-agent: Googlebot-Image
Crawl-delay: 2
Allow: /media/catalog/product/
Disallow: /

# Googlebot-News: full access, no restrictions needed.
# NOTE(review): this group has no Disallow rules, so /admin/, /customer/
# and /checkout/ are not blocked for this user agent — confirm intended.
User-agent: Googlebot-News
Crawl-delay: 1
Allow: /

# Googlebot-Video: full access as well (no Disallow rules), with a
# higher Crawl-delay value (5) than the Googlebot, Googlebot-Image and
# Googlebot-News groups.
User-agent: Googlebot-Video
Crawl-delay: 5
Allow: /

# Google-Extended: used for AI/LLM training (Gemini datasets).
# Product pages blocked to avoid catalog content being used
# for AI training without attribution or compensation.
# NOTE(review): the product rule below only covers URLs under
# /catalog/product/ — confirm that this is where product pages are
# actually served, otherwise they remain allowed by "Allow: /".
# Also blocked: /admin/, /customer/, /checkout/, /catalogsearch/ and
# /wpproductlabels/.
User-agent: Google-Extended
Crawl-delay: 30
Allow: /
Disallow: /admin/
Disallow: /customer/
Disallow: /checkout/
Disallow: /catalog/product/
Disallow: /catalogsearch/
Disallow: /wpproductlabels/


# ----------------------------------------------------------------
# BINGBOT & MICROSOFT
# Bingbot powers Bing Search and Bing Shopping.
# BingPreview generates link previews in Outlook and Teams.
# ----------------------------------------------------------------
# Bingbot: full access except account, checkout, search, product-label,
# filter and sort URLs; Crawl-delay of 3.
User-agent: Bingbot
Crawl-delay: 3
Allow: /
Disallow: /admin/
Disallow: /customer/
Disallow: /checkout/
Disallow: /sales/
Disallow: /wishlist/
Disallow: /catalogsearch/
Disallow: /wpproductlabels/
# Filter/faceted navigation pages. Rule paths must begin with "/", so
# the former "*/filtru/" pattern is expressed as two compliant rules
# with the same coverage (root-level /filtru/ and /filtru/ below any path).
Disallow: /filtru/
Disallow: /*/filtru/
# Sorting parameters — same products as the base listing, reordered.
Disallow: /*?*product_list_order=
Disallow: /*?*product_list_dir=

# BingPreview: may fetch everything except /admin/; Crawl-delay of 10.
User-agent: BingPreview
Crawl-delay: 10
Allow: /
Disallow: /admin/


# ----------------------------------------------------------------
# OTHER SEARCH ENGINES
# Applebot: powers Spotlight Search and Siri suggestions on Apple devices.
# Yandex: dominant search engine in Russia and Eastern Europe.
# DuckDuckBot: DuckDuckGo's web crawler.
# ----------------------------------------------------------------
# Applebot: everything allowed except /admin/ and /customer/;
# Crawl-delay of 10.
User-agent: Applebot
Crawl-delay: 10
Allow: /
Disallow: /admin/
Disallow: /customer/

# Yandex: same rules as Applebot (blocks /admin/ and /customer/,
# Crawl-delay of 10).
User-agent: Yandex
Crawl-delay: 10
Allow: /
Disallow: /admin/
Disallow: /customer/

# DuckDuckBot: only /admin/ is blocked; Crawl-delay of 5.
User-agent: DuckDuckBot
Crawl-delay: 5
Allow: /
Disallow: /admin/


# ----------------------------------------------------------------
# FACEBOOK / META — aggressively rate-limited
# facebookexternalhit fetches URLs shared in posts and messages
# to generate link previews. This can cause severe traffic spikes
# when a product link goes viral or is amplified by paid campaigns.
# meta-externalagent is Meta's general-purpose crawler.
# meta-externalads syncs product data for Dynamic Product Ads (DPA).
# Instagram shares crawling infrastructure with Meta's other bots.
#
# NOTE: Meta crawlers (especially facebookexternalhit and Facebot)
# often ignore robots.txt for link preview generation. The WAF
# enforces the /filtru/ block at the application level as a safety net.
# ----------------------------------------------------------------
# facebookexternalhit: link-preview fetcher. Allowed site-wide except
# account, checkout, search, product-label, compare and filter URLs;
# Crawl-delay of 30.
User-agent: facebookexternalhit
Crawl-delay: 30
Allow: /
Disallow: /admin/
Disallow: /customer/
Disallow: /checkout/
Disallow: /catalogsearch/
Disallow: /wpproductlabels/
Disallow: /catalog/product_compare/
# Filtru pages generate infinite URL combinations — blocked to protect
# Redis cache and OpenSearch from being overwhelmed.
# Rule paths must begin with "/", so the former "*/filtru/" pattern is
# written as two compliant rules with the same coverage.
Disallow: /filtru/
Disallow: /*/filtru/

# Facebot: allowed site-wide except /admin/, /wpproductlabels/ and
# filter URLs; Crawl-delay of 30.
User-agent: Facebot
Crawl-delay: 30
Allow: /
Disallow: /admin/
Disallow: /wpproductlabels/
# Filtru pages generate infinite URL combinations — blocked to protect
# Redis cache and OpenSearch from being overwhelmed.
# Rule paths must begin with "/", so the former "*/filtru/" pattern is
# written as two compliant rules with the same coverage.
Disallow: /filtru/
Disallow: /*/filtru/

# meta-externalagent: allowed site-wide except account, checkout,
# /catalog/product/, search, product-label and filter URLs;
# Crawl-delay of 60.
User-agent: meta-externalagent
Crawl-delay: 60
Allow: /
Disallow: /admin/
Disallow: /customer/
Disallow: /checkout/
Disallow: /catalog/product/
Disallow: /catalogsearch/
Disallow: /wpproductlabels/
# Filter pages. Rule paths must begin with "/", so the former
# "*/filtru/" pattern is written as two compliant rules with the
# same coverage (root-level /filtru/ and /filtru/ below any path).
Disallow: /filtru/
Disallow: /*/filtru/

# meta-externalads: allowed site-wide except /admin/, the whole
# /catalog/ tree and /wpproductlabels/; Crawl-delay of 60.
User-agent: meta-externalads
Crawl-delay: 60
Allow: /
Disallow: /admin/
Disallow: /catalog/
Disallow: /wpproductlabels/

# Instagram: allowed site-wide except /admin/ and the whole /catalog/
# tree; Crawl-delay of 30.
# NOTE(review): unlike the other Meta groups, neither this group nor
# meta-externalads blocks the /filtru/ filter pages — confirm intended.
User-agent: Instagram
Crawl-delay: 30
Allow: /
Disallow: /admin/
Disallow: /catalog/


# ----------------------------------------------------------------
# AI / LLM BOTS — access allowed, rate reduced
# These bots crawl content to train or power AI assistants.
# Product pages blocked to protect proprietary catalog data.
# GPTBot + ChatGPT-User + OpenAI-SearchBot: OpenAI crawlers.
# ClaudeBot + Claude-Web + anthropic-ai: Anthropic crawlers.
# PerplexityBot: powers Perplexity AI's real-time answer engine.
# cohere-ai: Cohere's LLM training crawler.
# YouBot: You.com's search and AI assistant crawler.
# CCBot: Common Crawl — a public dataset used by many LLMs.
# ----------------------------------------------------------------
# GPTBot: blocked from /admin/, /customer/, /checkout/,
# /catalog/product/ and /wpproductlabels/; Crawl-delay of 30.
User-agent: GPTBot
Crawl-delay: 30
Allow: /
Disallow: /admin/
Disallow: /customer/
Disallow: /checkout/
Disallow: /catalog/product/
Disallow: /wpproductlabels/

# ChatGPT-User: blocked from /admin/ and the entire /catalog/ tree
# (broader than the /catalog/product/ rule used for GPTBot);
# Crawl-delay of 60.
User-agent: ChatGPT-User
Crawl-delay: 60
Allow: /
Disallow: /admin/
Disallow: /catalog/

# OpenAI-SearchBot: blocked from /admin/ and /catalog/product/ only;
# Crawl-delay of 30.
User-agent: OpenAI-SearchBot
Crawl-delay: 30
Allow: /
Disallow: /admin/
Disallow: /catalog/product/

# ClaudeBot: blocked from /admin/, /customer/ and /catalog/product/;
# Crawl-delay of 30.
User-agent: ClaudeBot
Crawl-delay: 30
Allow: /
Disallow: /admin/
Disallow: /customer/
Disallow: /catalog/product/

# Claude-Web: blocked from /admin/ and the entire /catalog/ tree;
# Crawl-delay of 60.
User-agent: Claude-Web
Crawl-delay: 60
Allow: /
Disallow: /admin/
Disallow: /catalog/

# anthropic-ai: same rules as Claude-Web (/admin/ and /catalog/
# blocked, Crawl-delay of 60).
User-agent: anthropic-ai
Crawl-delay: 60
Allow: /
Disallow: /admin/
Disallow: /catalog/

# PerplexityBot: blocked from /admin/ and /catalog/product/;
# Crawl-delay of 30.
User-agent: PerplexityBot
Crawl-delay: 30
Allow: /
Disallow: /admin/
Disallow: /catalog/product/

# cohere-ai: blocked from /admin/ and the entire /catalog/ tree;
# Crawl-delay of 60.
User-agent: cohere-ai
Crawl-delay: 60
Allow: /
Disallow: /admin/
Disallow: /catalog/

# YouBot: same rules as cohere-ai (/admin/ and /catalog/ blocked,
# Crawl-delay of 60).
User-agent: YouBot
Crawl-delay: 60
Allow: /
Disallow: /admin/
Disallow: /catalog/

# CCBot: highest Crawl-delay value in this file (120); blocked from
# /admin/, the entire /catalog/ tree and /customer/.
User-agent: CCBot
Crawl-delay: 120
Allow: /
Disallow: /admin/
Disallow: /catalog/
Disallow: /customer/


# ----------------------------------------------------------------
# TIKTOK / BYTEDANCE
# ByteDanceSpider is TikTok's general-purpose crawler.
# TikTokBot fetches URLs shared in TikTok posts and bios
# to generate link previews and content cards.
# ----------------------------------------------------------------
# ByteDanceSpider: blocked from /admin/, /customer/ and /checkout/;
# Crawl-delay of 30.
User-agent: ByteDanceSpider
Crawl-delay: 30
Allow: /
Disallow: /admin/
Disallow: /customer/
Disallow: /checkout/

# TikTokBot: blocked from /admin/ and /customer/ (checkout is not
# blocked for this group); Crawl-delay of 30.
User-agent: TikTokBot
Crawl-delay: 30
Allow: /
Disallow: /admin/
Disallow: /customer/


# ----------------------------------------------------------------
# SEO / AUDIT TOOLS
# Used by marketers and agencies for site audits, backlink
# analysis, and keyword research. These bots do not affect
# search rankings but can generate noticeable crawl traffic.
# SemrushBot: Semrush suite.
# AhrefsBot: Ahrefs backlink index.
# MJ12bot: Majestic SEO crawler.
# DotBot: Moz's web crawler (Open Site Explorer).
# DataForSeoBot: DataForSEO SERP and rank tracking platform.
# ----------------------------------------------------------------
# SemrushBot: blocked from /admin/, /customer/ and /checkout/;
# Crawl-delay of 30.
User-agent: SemrushBot
Crawl-delay: 30
Allow: /
Disallow: /admin/
Disallow: /customer/
Disallow: /checkout/

# AhrefsBot: only /admin/ is blocked; Crawl-delay of 30.
User-agent: AhrefsBot
Crawl-delay: 30
Allow: /
Disallow: /admin/

# MJ12bot: blocked from /admin/ and /customer/; Crawl-delay of 60,
# the highest value among the SEO-tool groups.
User-agent: MJ12bot
Crawl-delay: 60
Allow: /
Disallow: /admin/
Disallow: /customer/

# DotBot: only /admin/ is blocked; Crawl-delay of 30.
User-agent: DotBot
Crawl-delay: 30
Allow: /
Disallow: /admin/

# DataForSeoBot: only /admin/ is blocked; Crawl-delay of 30.
User-agent: DataForSeoBot
Crawl-delay: 30
Allow: /
Disallow: /admin/


# ----------------------------------------------------------------
# INTERNET ARCHIVE (Wayback Machine)
# Archives public web pages for historical preservation.
# Restricted from customer and checkout flows for privacy reasons.
# ----------------------------------------------------------------
# ia_archiver: may archive everything except /admin/, /customer/ and
# /checkout/; Crawl-delay of 30.
User-agent: ia_archiver
Crawl-delay: 30
Allow: /
Disallow: /admin/
Disallow: /customer/
Disallow: /checkout/


# ----------------------------------------------------------------
# BAD BOTS — fully blocked
# Scrapers, offline site downloaders, and email harvesters
# with no legitimate indexing or preview purpose.
# ----------------------------------------------------------------
# Single shared group: every user agent listed below is subject to the
# one rule at the end of the group, a site-wide disallow. To block
# another bot, add its User-agent line to this list.
User-agent: SiteSnagger
User-agent: WebCopier
User-agent: WebReaper
User-agent: HTTrack
User-agent: Teleport
User-agent: WebZIP
User-agent: WebStripper
User-agent: Offline Explorer
User-agent: EmailCollector
User-agent: EmailSiphon
User-agent: WebBandit
User-agent: DittoSpyder
User-agent: EroCrawler
User-agent: LinkWalker
User-agent: MSIECrawler
User-agent: Zeus
User-agent: BackDoorBot
User-agent: JennyBot
User-agent: LexiBot
User-agent: SpankBot
User-agent: BotALot
User-agent: Openbot
User-agent: Foobot
User-agent: searchpreview
Disallow: /


# ----------------------------------------------------------------
# DEFAULT — catch-all for any unspecified bot
# Unknown crawlers get a conservative crawl delay and are blocked
# from all sensitive and duplicate-content paths.
# ----------------------------------------------------------------
# Catch-all group for every crawler without a dedicated group above.
# Groups are not cumulative: a bot that matches a named group uses only
# that group's rules and none of the rules here.
User-agent: *
Crawl-delay: 10
Disallow: /admin/
Disallow: /customer/
Disallow: /checkout/
Disallow: /sales/
Disallow: /sendfriend/
Disallow: /wishlist/
Disallow: /catalog/product_compare/
Disallow: /catalogsearch/
Disallow: /*?*q=
Disallow: /*?*SID=
# Page-size parameter. Written as "/*?*limit=" (not "/*?limit=") so it
# also matches when the parameter is not the first one in the query
# string, and covers any parameter name ending in "limit=".
Disallow: /*?*limit=
Disallow: /wpproductlabels/
# Filter/faceted navigation pages — noindex,nofollow in meta tags.
# Rule paths must begin with "/", so the former "*/filtru/" pattern is
# written as two compliant rules with the same coverage (root-level
# /filtru/ and /filtru/ below any other path).
Disallow: /filtru/
Disallow: /*/filtru/
# Sorting parameters — duplicate content, no SEO value
Disallow: /*?*product_list_order=
Disallow: /*?*product_list_dir=
# "Allow: /" is deliberately listed last. Standards-compliant parsers
# pick the longest matching rule regardless of order, but parsers that
# stop at the first matching rule would otherwise hit "Allow: /" first
# and never evaluate the Disallow rules above.
Allow: /

# Sitemap index location. The "Sitemap:" field name is required: a bare
# URL line is not a valid robots.txt record and crawlers skip it, so the
# sitemap would never be advertised. This record is independent of the
# User-agent groups and applies to all crawlers.
Sitemap: https://www.sensodays.ro/sensodays/sitemap_index.xml