# ================================================================
# robots.txt — www.sensodays.ro
# ================================================================
# NOTE: Crawl-delay is honored by well-behaved bots only, and
# Googlebot does not support the directive at all.
# For time-based crawling restrictions (e.g. crawl only at night):
# - Googlebot: temporarily serve 429/503 responses (the legacy
#   Search Console crawl-rate limiter has been retired)
# - Cloudflare Rate Limiting rules (filter by User-Agent + hour)
# - Nginx/Apache access rules per User-Agent
#
# FORMAT: one directive per line. A "#" comments out the rest of
# its line, so a directive must never share a line with a comment.
# Do not put blank lines inside a group.
# ================================================================

# ----------------------------------------------------------------
# GOOGLEBOT — highest priority, fast crawl allowed
# Googlebot ignores Crawl-delay; the value below is kept only for
# documentation. Monitor crawl activity in Google Search Console
# (Settings > Crawl stats).
# "Allow: /" is placed last so that first-match parsers still reach
# the Disallow rules; longest-match parsers (RFC 9309) are
# unaffected by rule order.
# ----------------------------------------------------------------
User-agent: Googlebot
Crawl-delay: 1
Disallow: /admin/
Disallow: /customer/
Disallow: /checkout/
Disallow: /sales/
Disallow: /sendfriend/
Disallow: /wishlist/
Disallow: /catalog/product_compare/
Disallow: /catalogsearch/
Disallow: /*?*q=
Disallow: /*?*SID=
Disallow: /*?limit=
Disallow: /wpproductlabels/
# Filter/faceted navigation pages (e.g. /chiuvete-de-inox/filtru/box.html)
# These pages carry meta noindex,nofollow but are also blocked here
# to save crawl budget. NOTE: blocking in robots.txt means Googlebot
# cannot read the noindex tag, so a blocked URL that is linked from
# elsewhere can still be indexed without its content. Keep this in
# sync if the noindex policy ever changes.
# Path patterns must start with "/"; the two rules together cover
# a /filtru/ segment at any depth.
Disallow: /filtru/
Disallow: /*/filtru/
# Sorting and direction parameters — duplicate content, no SEO value.
# Same products as the base category, just reordered. Meta noindex
# is set on these pages but blocking here also saves crawl budget.
# Pagination (?p=2, ?p=3) is intentionally NOT blocked — Googlebot
# needs to follow links on those pages to discover all products.
Disallow: /*?*product_list_order=
Disallow: /*?*product_list_dir=
Allow: /

# Googlebot-Image: restricted to product images only.
# Allowing /media/catalog/product/ helps Google index product
# photos for Google Shopping and Image Search results.
# The Allow rule comes before "Disallow: /" so that first-match
# parsers also honor the exception.
User-agent: Googlebot-Image
Crawl-delay: 2
Allow: /media/catalog/product/
Disallow: /

# Googlebot-News: full access, no restrictions needed.
User-agent: Googlebot-News
Crawl-delay: 1
Allow: /

User-agent: Googlebot-Video
Crawl-delay: 5
Allow: /

# Google-Extended: used for AI/LLM training (Gemini datasets).
# Product pages blocked to avoid catalog content being used
# for AI training without attribution or compensation.
# NOTE(review): /catalog/product/ only matches URLs under that
# literal path. If product pages are served at rewritten URLs
# (e.g. /some-product.html) they are not covered — confirm.
# "Allow: /" is placed last so that first-match parsers still reach
# the Disallow rules; longest-match parsers (RFC 9309) are
# unaffected by rule order.
User-agent: Google-Extended
Crawl-delay: 30
Disallow: /admin/
Disallow: /customer/
Disallow: /checkout/
Disallow: /catalog/product/
Disallow: /catalogsearch/
Disallow: /wpproductlabels/
Allow: /

# ----------------------------------------------------------------
# BINGBOT & MICROSOFT
# Bingbot powers Bing Search and Bing Shopping.
# BingPreview generates link previews in Outlook and Teams.
# ----------------------------------------------------------------
# Path patterns must start with "/"; the two filtru rules together
# cover a /filtru/ segment at any depth.
User-agent: Bingbot
Crawl-delay: 3
Disallow: /admin/
Disallow: /customer/
Disallow: /checkout/
Disallow: /sales/
Disallow: /wishlist/
Disallow: /catalogsearch/
Disallow: /wpproductlabels/
Disallow: /filtru/
Disallow: /*/filtru/
Disallow: /*?*product_list_order=
Disallow: /*?*product_list_dir=
Allow: /

User-agent: BingPreview
Crawl-delay: 10
Disallow: /admin/
Allow: /

# ----------------------------------------------------------------
# OTHER SEARCH ENGINES
# Applebot: powers Spotlight Search and Siri suggestions on Apple devices.
# Yandex: dominant search engine in Russia and Eastern Europe.
# DuckDuckBot: DuckDuckGo's web crawler.
# ----------------------------------------------------------------
# "Allow: /" is placed last so that first-match parsers still reach
# the Disallow rules; longest-match parsers (RFC 9309) are
# unaffected by rule order.
User-agent: Applebot
Crawl-delay: 10
Disallow: /admin/
Disallow: /customer/
Allow: /

User-agent: Yandex
Crawl-delay: 10
Disallow: /admin/
Disallow: /customer/
Allow: /

User-agent: DuckDuckBot
Crawl-delay: 5
Disallow: /admin/
Allow: /

# ----------------------------------------------------------------
# FACEBOOK / META — aggressively rate-limited
# facebookexternalhit fetches URLs shared in posts and messages
# to generate link previews. This can cause severe traffic spikes
# when a product link goes viral or is amplified by paid campaigns.
# meta-externalagent is Meta's general-purpose crawler.
# meta-externalads syncs product data for Dynamic Product Ads (DPA).
# Instagram shares crawling infrastructure with Meta's other bots.
#
# NOTE: Meta crawlers (especially facebookexternalhit and Facebot)
# often ignore robots.txt for link preview generation. The WAF
# enforces the /filtru/ block at the application level as a safety net.
# ----------------------------------------------------------------
# "Allow: /" is placed last so that first-match parsers still reach
# the Disallow rules; longest-match parsers (RFC 9309) are
# unaffected by rule order.
User-agent: facebookexternalhit
Crawl-delay: 30
Disallow: /admin/
Disallow: /customer/
Disallow: /checkout/
Disallow: /catalogsearch/
Disallow: /wpproductlabels/
Disallow: /catalog/product_compare/
# Filtru pages generate infinite URL combinations — blocked to protect
# Redis cache and OpenSearch from being overwhelmed.
# Path patterns must start with "/"; the two rules together cover
# a /filtru/ segment at any depth.
Disallow: /filtru/
Disallow: /*/filtru/
Allow: /

User-agent: Facebot
Crawl-delay: 30
Disallow: /admin/
Disallow: /wpproductlabels/
# Filtru pages generate infinite URL combinations — blocked to protect
# Redis cache and OpenSearch from being overwhelmed.
Disallow: /filtru/
Disallow: /*/filtru/
Allow: /

User-agent: meta-externalagent
Crawl-delay: 60
Disallow: /admin/
Disallow: /customer/
Disallow: /checkout/
Disallow: /catalog/product/
Disallow: /catalogsearch/
Disallow: /wpproductlabels/
Disallow: /filtru/
Disallow: /*/filtru/
Allow: /

User-agent: meta-externalads
Crawl-delay: 60
Disallow: /admin/
Disallow: /catalog/
Disallow: /wpproductlabels/
Allow: /

User-agent: Instagram
Crawl-delay: 30
Disallow: /admin/
Disallow: /catalog/
Allow: /

# ----------------------------------------------------------------
# AI / LLM BOTS — access allowed, rate reduced
# These bots crawl content to train or power AI assistants.
# Product pages blocked to protect proprietary catalog data.
# NOTE(review): /catalog/ and /catalog/product/ only match URLs
# under those literal paths. If product pages are served at
# rewritten URLs (e.g. /some-product.html) they are not covered —
# confirm.
# GPTBot + ChatGPT-User + OpenAI-SearchBot: OpenAI crawlers.
# ClaudeBot + Claude-Web + anthropic-ai: Anthropic crawlers.
# PerplexityBot: powers Perplexity AI's real-time answer engine.
# cohere-ai: Cohere's LLM training crawler.
# YouBot: You.com's search and AI assistant crawler.
# CCBot: Common Crawl — a public dataset used by many LLMs.
# ----------------------------------------------------------------
# "Allow: /" is placed last so that first-match parsers still reach
# the Disallow rules; longest-match parsers (RFC 9309) are
# unaffected by rule order.
User-agent: GPTBot
Crawl-delay: 30
Disallow: /admin/
Disallow: /customer/
Disallow: /checkout/
Disallow: /catalog/product/
Disallow: /wpproductlabels/
Allow: /

User-agent: ChatGPT-User
Crawl-delay: 60
Disallow: /admin/
Disallow: /catalog/
Allow: /

User-agent: OpenAI-SearchBot
Crawl-delay: 30
Disallow: /admin/
Disallow: /catalog/product/
Allow: /

User-agent: ClaudeBot
Crawl-delay: 30
Disallow: /admin/
Disallow: /customer/
Disallow: /catalog/product/
Allow: /

User-agent: Claude-Web
Crawl-delay: 60
Disallow: /admin/
Disallow: /catalog/
Allow: /

User-agent: anthropic-ai
Crawl-delay: 60
Disallow: /admin/
Disallow: /catalog/
Allow: /

User-agent: PerplexityBot
Crawl-delay: 30
Disallow: /admin/
Disallow: /catalog/product/
Allow: /

User-agent: cohere-ai
Crawl-delay: 60
Disallow: /admin/
Disallow: /catalog/
Allow: /

User-agent: YouBot
Crawl-delay: 60
Disallow: /admin/
Disallow: /catalog/
Allow: /

User-agent: CCBot
Crawl-delay: 120
Disallow: /admin/
Disallow: /catalog/
Disallow: /customer/
Allow: /

# ----------------------------------------------------------------
# TIKTOK / BYTEDANCE
# ByteDanceSpider is TikTok's general-purpose crawler.
# TikTokBot fetches URLs shared in TikTok posts and bios
# to generate link previews and content cards.
# ----------------------------------------------------------------
User-agent: ByteDanceSpider
Crawl-delay: 30
Disallow: /admin/
Disallow: /customer/
Disallow: /checkout/
Allow: /

User-agent: TikTokBot
Crawl-delay: 30
Disallow: /admin/
Disallow: /customer/
Allow: /

# ----------------------------------------------------------------
# SEO / AUDIT TOOLS
# Used by marketers and agencies for site audits, backlink
# analysis, and keyword research. These bots do not affect
# search rankings but can generate noticeable crawl traffic.
# SemrushBot: Semrush suite.
# AhrefsBot: Ahrefs backlink index.
# MJ12bot: Majestic SEO crawler.
# DotBot: Moz's web crawler (Open Site Explorer).
# DataForSeoBot: DataForSEO SERP and rank tracking platform.
# ----------------------------------------------------------------
# "Allow: /" is placed last so that first-match parsers still reach
# the Disallow rules; longest-match parsers (RFC 9309) are
# unaffected by rule order.
User-agent: SemrushBot
Crawl-delay: 30
Disallow: /admin/
Disallow: /customer/
Disallow: /checkout/
Allow: /

User-agent: AhrefsBot
Crawl-delay: 30
Disallow: /admin/
Allow: /

User-agent: MJ12bot
Crawl-delay: 60
Disallow: /admin/
Disallow: /customer/
Allow: /

User-agent: DotBot
Crawl-delay: 30
Disallow: /admin/
Allow: /

User-agent: DataForSeoBot
Crawl-delay: 30
Disallow: /admin/
Allow: /

# ----------------------------------------------------------------
# INTERNET ARCHIVE (Wayback Machine)
# Archives public web pages for historical preservation.
# Restricted from customer and checkout flows for privacy reasons.
# ----------------------------------------------------------------
User-agent: ia_archiver
Crawl-delay: 30
Disallow: /admin/
Disallow: /customer/
Disallow: /checkout/
Allow: /

# ----------------------------------------------------------------
# BAD BOTS — fully blocked
# Scrapers, offline site downloaders, and email harvesters
# with no legitimate indexing or preview purpose.
# ----------------------------------------------------------------
User-agent: SiteSnagger
Disallow: /

User-agent: WebCopier
Disallow: /

User-agent: WebReaper
Disallow: /

User-agent: HTTrack
Disallow: /

User-agent: Teleport
Disallow: /

User-agent: WebZIP
Disallow: /

User-agent: WebStripper
Disallow: /

User-agent: Offline Explorer
Disallow: /

User-agent: EmailCollector
Disallow: /

User-agent: EmailSiphon
Disallow: /

User-agent: WebBandit
Disallow: /

User-agent: DittoSpyder
Disallow: /

User-agent: EroCrawler
Disallow: /

User-agent: LinkWalker
Disallow: /

User-agent: MSIECrawler
Disallow: /

User-agent: Zeus
Disallow: /

User-agent: BackDoorBot
Disallow: /

User-agent: JennyBot
Disallow: /

User-agent: LexiBot
Disallow: /

User-agent: SpankBot
Disallow: /

User-agent: BotALot
Disallow: /

User-agent: Openbot
Disallow: /

User-agent: Foobot
Disallow: /

User-agent: searchpreview
Disallow: /

# ----------------------------------------------------------------
# DEFAULT — catch-all for any unspecified bot
# Unknown crawlers get a conservative crawl delay and are blocked
# from all sensitive and duplicate-content paths.
# A crawler that matches a named group uses only that group; the
# rules below are not inherited by named groups.
# "Allow: /" is placed last so that first-match parsers still reach
# the Disallow rules; longest-match parsers (RFC 9309) are
# unaffected by rule order.
# ----------------------------------------------------------------
User-agent: *
Crawl-delay: 10
Disallow: /admin/
Disallow: /customer/
Disallow: /checkout/
Disallow: /sales/
Disallow: /sendfriend/
Disallow: /wishlist/
Disallow: /catalog/product_compare/
Disallow: /catalogsearch/
Disallow: /*?*q=
Disallow: /*?*SID=
Disallow: /*?limit=
Disallow: /wpproductlabels/
# Filter/faceted navigation pages — noindex,nofollow in meta tags.
# Path patterns must start with "/"; the two rules together cover
# a /filtru/ segment at any depth.
Disallow: /filtru/
Disallow: /*/filtru/
# Sorting parameters — duplicate content, no SEO value
Disallow: /*?*product_list_order=
Disallow: /*?*product_list_dir=
Allow: /

# Sitemap location — the "Sitemap:" field name is required; a bare
# URL is not a valid directive and is ignored by parsers.
Sitemap: https://www.sensodays.ro/sensodays/sitemap_index.xml