# robots.txt - crawler policy for a WordPress-based news site.
#
# Format notes:
#   * One record per line. "#" starts a comment that runs to the end of the
#     line, so directives must never share a line with a leading comment.
#   * A crawler obeys only the most specific User-agent group that matches it.
#     Bots with their own group below do NOT inherit the "*" rules.
#   * Crawl-delay is a non-standard extension; crawlers that do not implement
#     it (Googlebot among them, per Google's documentation) ignore it.
#   * Groups are kept free of blank lines because some parsers treat a blank
#     line as the end of a group.

# Rate limiting for compliant bots (reduced for news crawlers)
User-agent: *
Crawl-delay: 5
# WordPress security while preserving news access
Disallow: /wp-admin/
Allow: /wp-admin/admin-ajax.php
Disallow: /wp-includes/
Disallow: /wp-content/plugins/
Disallow: /wp-content/themes/
Disallow: /cgi-bin/
Disallow: /trackback/
Disallow: /xmlrpc.php
Disallow: /?s=
Disallow: /comment-page-
Disallow: /author/*/page/
# Avoid duplicate crawl from junk params
Disallow: /*?utm_
Disallow: /*?replytocom
# CRITICAL: Allow news feeds and API endpoints
Allow: /feed/
Allow: /category/*/feed/
Allow: /tag/*/feed/
Allow: /wp-json/wp/v2/posts
Allow: /wp-json/wp/v2/categories

# Social media crawlers (important for news sharing)
User-agent: facebookexternalhit
Allow: /

User-agent: Twitterbot
Allow: /

User-agent: LinkedInBot
Allow: /

# Preserve Google News access
User-agent: Googlebot-News
Disallow: /wp-admin/
Allow: /

User-agent: Googlebot
Crawl-delay: 1
Allow: /

# Bing News crawler
User-agent: bingbot
Crawl-delay: 2
Allow: /

# Apple News
User-agent: Applebot
Allow: /

# Block SEO research & AI crawlers
User-agent: AhrefsBot
Disallow: /

User-agent: SemrushBot
Disallow: /

User-agent: MJ12bot
Disallow: /

User-agent: DotBot
Disallow: /

User-agent: BLEXBot
Disallow: /

User-agent: DataForSeoBot
Disallow: /

User-agent: ScreamingFrogSEOSpider
Disallow: /

User-agent: PetalBot
Disallow: /

User-agent: YandexBot
Disallow: /

User-agent: PerplexityBot
Disallow: /

User-agent: GPTBot
Disallow: /

# Allow Wayback Machine (beneficial for news sites)
User-agent: archive.org_bot
Crawl-delay: 10
Disallow: /wp-admin/
Allow: /

User-agent: ia_archiver
Crawl-delay: 10
Disallow: /wp-admin/
Allow: /

# Meta-ExternalAgent: throttled only; this group sets no path restrictions.
User-agent: Meta-ExternalAgent
Crawl-delay: 10

# Sitemaps (critical for news sites)
Sitemap: https://film-book.com/sitemap_index.xml
Sitemap: https://film-book.com/news-sitemap.xml