# As a condition of accessing this website, you agree to abide by the following
# content signals:

# (a)  If a Content-Signal = yes, you may collect content for the corresponding
#      use.
# (b)  If a Content-Signal = no, you may not collect content for the
#      corresponding use.
# (c)  If the website operator does not include a Content-Signal for a
#      corresponding use, the website operator neither grants nor restricts
#      permission via Content-Signal with respect to the corresponding use.

# The content signals and their meanings are:

# search:   building a search index and providing search results (e.g., returning
#           hyperlinks and short excerpts from your website's contents). Search does not
#           include providing AI-generated search summaries.
# ai-input: inputting content into one or more AI models (e.g., retrieval
#           augmented generation, grounding, or other real-time taking of content for
#           generative AI search answers).
# ai-train: training or fine-tuning AI models.

# ANY RESTRICTIONS EXPRESSED VIA CONTENT SIGNALS ARE EXPRESS RESERVATIONS OF
# RIGHTS UNDER ARTICLE 4 OF THE EUROPEAN UNION DIRECTIVE 2019/790 ON COPYRIGHT
# AND RELATED RIGHTS IN THE DIGITAL SINGLE MARKET.

# BEGIN Cloudflare Managed content
# NOTE(review): the BEGIN/END markers suggest this section is generated by
# Cloudflare - confirm before hand-editing, since changes may be overwritten.

# Group for all crawlers that do not match a more specific User-agent group.
# Signals that search use is permitted and AI training is not; no ai-input
# signal is declared. The whole site is allowed.
User-agent: *
Content-Signal: search=yes,ai-train=no
Allow: /

# Each crawler named below is denied the entire site (Disallow: /).
User-agent: Amazonbot
Disallow: /

User-agent: Applebot-Extended
Disallow: /

User-agent: Bytespider
Disallow: /

User-agent: CCBot
Disallow: /

User-agent: ClaudeBot
Disallow: /

User-agent: CloudflareBrowserRenderingCrawler
Disallow: /

User-agent: Google-Extended
Disallow: /

User-agent: GPTBot
Disallow: /

User-agent: meta-externalagent
Disallow: /

# END Cloudflare Managed Content

# robots.txt for The Edinburgh Reporter

# Sitemaps (Yoast-style index; adjust if different)
Sitemap: https://theedinburghreporter.co.uk/sitemap_index.xml

# Default policy for all crawlers that do not match a more specific group.
# NOTE: keep this group free of blank lines. Parsers that follow the original
# 1994 robots.txt record convention (e.g. Python's urllib.robotparser) end a
# group at the first blank line and silently drop every rule after it.
User-agent: *
# Keep the WordPress admin area out of crawls, except admin-ajax.php
Disallow: /wp-admin/
Allow: /wp-admin/admin-ajax.php
# Keep search results, reply links, and utility params out of the index
Disallow: /?s=
Disallow: /search/
Disallow: /*?replytocom=
Disallow: /*?share=
Disallow: /*?relatedposts=
Disallow: /cgi-bin/
# Ensure media can be crawled (important for Google News / Discover)
Allow: /wp-content/uploads/

# --- Helpful per-bot tuning -------------------------------------------------
#
# A crawler obeys only the most specific group that matches its name and
# ignores the "User-agent: *" group entirely. Each group here therefore
# repeats the site-wide rules; a bare "Allow: /" would re-open /wp-admin/,
# search results, and the utility params for that bot. Anything not
# disallowed is allowed by default, so no explicit "Allow: /" is needed.

# Crawlers that get the site-wide rules with no throttling:
#   Googlebot, Googlebot-Image - Google ignores Crawl-delay
#   Pinterestbot               - Pinterest
#   bl.uk_ldfc_bot             - British Library Legal Deposit crawler (UK news archives)
#   OAI-SearchBot              - OpenAI SearchBot (site search/discovery)
User-agent: Googlebot
User-agent: Googlebot-Image
User-agent: Pinterestbot
User-agent: bl.uk_ldfc_bot
User-agent: OAI-SearchBot
Disallow: /wp-admin/
Allow: /wp-admin/admin-ajax.php
Disallow: /?s=
Disallow: /search/
Disallow: /*?replytocom=
Disallow: /*?share=
Disallow: /*?relatedposts=
Disallow: /cgi-bin/
Allow: /wp-content/uploads/

# Microsoft Bing honors Crawl-delay
User-agent: bingbot
Disallow: /wp-admin/
Allow: /wp-admin/admin-ajax.php
Disallow: /?s=
Disallow: /search/
Disallow: /*?replytocom=
Disallow: /*?share=
Disallow: /*?relatedposts=
Disallow: /cgi-bin/
Allow: /wp-content/uploads/
Crawl-delay: 5

# Ad/brand-safety, sentiment, and common SEO crawlers - not blocked, just
# throttled. A crawler with its own group ignores the "User-agent: *" group,
# so the site-wide rules are repeated here; a group holding only Crawl-delay
# would leave /wp-admin/, search results, and utility params open to these
# bots.
#   ias-or, SentiBot               - ad/brand-safety and sentiment
#   AhrefsBot, SemrushBot, MJ12bot - SEO
User-agent: ias-or
User-agent: SentiBot
User-agent: AhrefsBot
User-agent: SemrushBot
User-agent: MJ12bot
Disallow: /wp-admin/
Allow: /wp-admin/admin-ajax.php
Disallow: /?s=
Disallow: /search/
Disallow: /*?replytocom=
Disallow: /*?share=
Disallow: /*?relatedposts=
Disallow: /cgi-bin/
Allow: /wp-content/uploads/
Crawl-delay: 10