# ═══════════════════════════════════════════════════════════════════════════════
# robots.txt - Tax-Fin-Lex Legal Portal
# Slovenia's Premier Legal Information Portal
# https://www.tax-fin-lex.si/robots.txt
# ═══════════════════════════════════════════════════════════════════════════════
# Last Updated: 2026-03-23
# Optimized for: SEO, AI/LLM discoverability, security
# Policy: ALLOW AI crawlers for legal citation visibility
# ═══════════════════════════════════════════════════════════════════════════════

# ┌─────────────────────────────────────────────────────────────────────────────┐
# │ 1. DEFAULT RULES                                                            │
# └─────────────────────────────────────────────────────────────────────────────┘
# The default ("*") rule set starts in section 5 (SECURITY) below.
# No separate "User-agent: *" / "Allow: /" stub is kept here: crawling is allowed
# by default, and a parser that honours only the first matching group would stop
# at the stub and never reach the Disallow rules.
# Note: no Crawl-delay for the default group - Google ignores that directive.

# ┌─────────────────────────────────────────────────────────────────────────────┐
# │ 2. MAJOR SEARCH ENGINES                                                     │
# └─────────────────────────────────────────────────────────────────────────────┘

# General-purpose web search crawlers sharing one rule: the whole site is open.
User-agent: Googlebot
User-agent: Bingbot
User-agent: Slurp
User-agent: DuckDuckBot
Allow: /

# Google Images: the theme and content image folders are named explicitly.
User-agent: Googlebot-Image
Allow: /theme/images/
Allow: /content/images/

# Crawlers that get full access together with a Crawl-delay value.
User-agent: Yandex
Allow: /
Crawl-delay: 2

User-agent: Baiduspider
Allow: /
Crawl-delay: 5

# ┌─────────────────────────────────────────────────────────────────────────────┐
# │ 3. AI SEARCH CRAWLERS (Real-time queries, citations) - WELCOME!             │
# │    Used for search indexing and live user requests, not model training      │
# └─────────────────────────────────────────────────────────────────────────────┘

# All AI search / user-request agents share a single group that opens the whole site.
# OpenAI - Search features (not training)
User-agent: ChatGPT-User
User-agent: OAI-SearchBot
# Anthropic Claude - User requests and search
User-agent: Claude-User
User-agent: Claude-SearchBot
# Perplexity - AI search engine
User-agent: PerplexityBot
User-agent: Perplexity-User
# You.com - AI search
User-agent: YouBot
Allow: /

# ┌─────────────────────────────────────────────────────────────────────────────┐
# │ 4. AI TRAINING CRAWLERS - ALLOWED FOR LEGAL CONTENT VISIBILITY              │
# │    We WANT our legal content in AI knowledge bases for proper citations     │
# └─────────────────────────────────────────────────────────────────────────────┘

# Every crawler in this section gets the whole site, except CCBot, which is held
# to an allow-list of content paths.

# OpenAI GPT training
User-agent: GPTBot
Allow: /

# Anthropic Claude training
User-agent: ClaudeBot
Allow: /

User-agent: anthropic-ai
Allow: /

# Google AI (Gemini, Bard)
User-agent: Google-Extended
Allow: /

User-agent: Google-CloudVertexBot
Allow: /

# Apple (Siri, Apple Intelligence)
User-agent: Applebot
Allow: /

User-agent: Applebot-Extended
Allow: /

# Meta AI (Llama)
User-agent: Meta-ExternalAgent
Allow: /

User-agent: Meta-ExternalFetcher
Allow: /

User-agent: FacebookBot
Allow: /

# Microsoft/Bing AI (Copilot)
# No group here: user-agent tokens are matched case-insensitively, so "bingbot"
# is the same token as "Bingbot", which already has its group in section 2.

# Cohere AI
User-agent: cohere-ai
Allow: /

# Amazon (Alexa, AWS AI)
User-agent: Amazonbot
Allow: /

# Common Crawl - Allow legal content (used by many AI models)
# The longest matching rule wins, so the specific Disallow lines below carve the
# internal AJAX endpoints and thin search-result pages back out of the allowed
# prefixes; the final "Disallow: /" closes everything that is not allow-listed.
User-agent: CCBot
Allow: /dokument/
Allow: /sodnapraksa/
Allow: /zbirke/
Allow: /iskanje/
Allow: /tfl-ai/
Disallow: /dokument/nodedetail*
Disallow: /dokument/getnode*
Disallow: /dokument/commentadd*
Disallow: /dokument/commentdelete*
Disallow: /iskanje/results*
Disallow: /*/partial*
Disallow: /

# ┌─────────────────────────────────────────────────────────────────────────────┐
# │ 5. SECURITY - BLOCKED PATHS (for ALL crawlers)                              │
# └─────────────────────────────────────────────────────────────────────────────┘

# One shared group for the default ("*") and for every crawler that also has a
# group of its own elsewhere in this file. A crawler that finds a group naming it
# does not fall back to "*", so each one has to be listed here for these blocks
# to reach it; parsers combine all groups that name the same crawler.
# The fully blocked bots (section 7) are left out: they are already denied "/".
# There are deliberately no blank lines inside the group - some parsers treat a
# blank line as the end of a record, which would orphan the rules.
# NOTE(review): path matching is case-sensitive; confirm the site emits these
# paths and parameter names (e.g. "returnUrl") with exactly this casing.
User-agent: *
# Major search engines
User-agent: Googlebot
User-agent: Googlebot-Image
User-agent: Bingbot
User-agent: Slurp
User-agent: DuckDuckBot
User-agent: Yandex
User-agent: Baiduspider
# AI search crawlers
User-agent: ChatGPT-User
User-agent: OAI-SearchBot
User-agent: Claude-User
User-agent: Claude-SearchBot
User-agent: PerplexityBot
User-agent: Perplexity-User
User-agent: YouBot
# AI training crawlers
User-agent: GPTBot
User-agent: ClaudeBot
User-agent: anthropic-ai
User-agent: Google-Extended
User-agent: Google-CloudVertexBot
User-agent: Applebot
User-agent: Applebot-Extended
User-agent: Meta-ExternalAgent
User-agent: Meta-ExternalFetcher
User-agent: FacebookBot
User-agent: cohere-ai
User-agent: Amazonbot
User-agent: CCBot
# Rate-limited SEO tools
User-agent: AhrefsBot
User-agent: SemrushBot
User-agent: SemrushBot-OCOB
User-agent: DotBot
User-agent: rogerbot
User-agent: MojeekBot
#
# Administrative and system
Disallow: /admin/
Disallow: /api/
Disallow: /app_data/
Disallow: /app_start/
Disallow: /appcode/
Disallow: /bin/
Disallow: /obj/
Disallow: /tmp/
Disallow: /old/
Disallow: /error/
Disallow: /elmah.axd
#
# User accounts and private content
Disallow: /uporabnik/
Disallow: /account/
Disallow: /tflradar/dodajdokumentvtflradar
#
# Internal/AJAX endpoints
Disallow: /dokument/nodedetail*
Disallow: /dokument/getnode*
Disallow: /dokument/commentadd*
Disallow: /dokument/commentdelete*
Disallow: /home/odstranikomentar*
Disallow: /companycheck/arhivpodjetijexcel*
Disallow: /companycheck/createexcel*
Disallow: /home/bannerref*
Disallow: /home/logclick*
Disallow: /*/_*
Disallow: /*/partial*
#
# Search results (thin content)
Disallow: /search/
Disallow: /iskanje/results*
#
# ┌─────────────────────────────────────────────────────────────────────────────┐
# │ 6. DUPLICATE CONTENT PREVENTION                                             │
# │    Block URL parameters that create duplicate pages                         │
# └─────────────────────────────────────────────────────────────────────────────┘
#
# Same group as above. Each parameter is listed twice: "?name=" matches it as the
# first query parameter, "&name=" matches it in any later position.
#
# Session and authentication
Disallow: /*?sid=*
Disallow: /*&sid=*
Disallow: /*?sessionid=*
Disallow: /*&sessionid=*
Disallow: /*?PHPSESSID=*
Disallow: /*&PHPSESSID=*
Disallow: /*?jsessionid=*
Disallow: /*&jsessionid=*
#
# Print and export variants
Disallow: /*?print=*
Disallow: /*&print=*
Disallow: /*?export=*
Disallow: /*&export=*
Disallow: /*?pdf=*
Disallow: /*&pdf=*
Disallow: /*?format=*
Disallow: /*&format=*
#
# UTM and marketing tracking
Disallow: /*?utm_*
Disallow: /*&utm_*
Disallow: /*?ref=*
Disallow: /*&ref=*
Disallow: /*?source=*
Disallow: /*&source=*
Disallow: /*?campaign=*
Disallow: /*&campaign=*
Disallow: /*?fbclid=*
Disallow: /*&fbclid=*
Disallow: /*?gclid=*
Disallow: /*&gclid=*
Disallow: /*?msclkid=*
Disallow: /*&msclkid=*
#
# Sorting and filtering (creates duplicate views)
Disallow: /*?sort=*
Disallow: /*&sort=*
Disallow: /*?order=*
Disallow: /*&order=*
Disallow: /*?filter=*
Disallow: /*&filter=*
Disallow: /*?page=0
Disallow: /*&page=0
#
# Internal tracking
Disallow: /*?returnUrl=*
Disallow: /*&returnUrl=*
Disallow: /*?redirect=*
Disallow: /*&redirect=*

# ┌─────────────────────────────────────────────────────────────────────────────┐
# │ 7. AGGRESSIVE SCRAPERS & BAD BOTS - BLOCKED                                 │
# │    These provide no value and waste server resources                        │
# └─────────────────────────────────────────────────────────────────────────────┘

# Every group in this section names exactly one crawler token and denies it the
# whole site with "Disallow: /". The groups are kept one token per group.

# ByteDance - aggressive, often ignores robots.txt
User-agent: Bytespider
Disallow: /

User-agent: TikTokSpider
Disallow: /

# SEO tool scrapers (use official APIs instead)
User-agent: MJ12bot
Disallow: /

User-agent: BLEXBot
Disallow: /

# SEOkicks is listed under both its long and its short token.
User-agent: SEOkicks-Robot
Disallow: /

User-agent: SEOkicks
Disallow: /

User-agent: seoscanners
Disallow: /

# Backlink scrapers
User-agent: BacklinkCrawler
Disallow: /

User-agent: BomboraBot
Disallow: /

# Data harvesters
User-agent: Diffbot
Disallow: /

# Omgili is listed under both its short and its "bot" token.
User-agent: Omgili
Disallow: /

User-agent: Omgilibot
Disallow: /

User-agent: webzio-extended
Disallow: /

User-agent: ImagesiftBot
Disallow: /

# Content copiers
User-agent: TurnitinBot
Disallow: /

User-agent: Timpibot
Disallow: /

User-agent: PetalBot
Disallow: /

User-agent: Sogou
Disallow: /

# Misc bad actors
User-agent: VelenPublicWebCrawler
Disallow: /

User-agent: ICC-Crawler
Disallow: /

# The same token in two capitalisations - presumably to cover matchers that
# compare case-sensitively (TODO confirm this is still needed).
User-agent: builtwith
Disallow: /

User-agent: BuiltWith
Disallow: /

User-agent: Jorgee
Disallow: /

User-agent: Mb2345Browser
Disallow: /

User-agent: MegaIndex
Disallow: /

User-agent: ltx71
Disallow: /

User-agent: Cliqzbot
Disallow: /

User-agent: Seekport
Disallow: /

User-agent: serpstatbot
Disallow: /

User-agent: ZoominfoBot
Disallow: /

User-agent: DataForSeoBot
Disallow: /

# Archive bots (optional - uncomment if you want historical archiving)
# The group below is disabled; as written it contains "Disallow: /".
# User-agent: ia_archiver
# Disallow: /

# ┌─────────────────────────────────────────────────────────────────────────────┐
# │ 8. SEO TOOLS - RATE LIMITED (useful but resource-intensive)                 │
# └─────────────────────────────────────────────────────────────────────────────┘

# Each SEO crawler below keeps access to the whole site ("Allow: /") and is given
# a Crawl-delay value: 10 for every tool except MojeekBot, which gets 5.
User-agent: AhrefsBot
Allow: /
Crawl-delay: 10

User-agent: SemrushBot
Allow: /
Crawl-delay: 10

# Separate Semrush token with the same settings as SemrushBot.
User-agent: SemrushBot-OCOB
Allow: /
Crawl-delay: 10

User-agent: DotBot
Allow: /
Crawl-delay: 10

User-agent: rogerbot
Allow: /
Crawl-delay: 10

# Lower delay than the tools above.
User-agent: MojeekBot
Allow: /
Crawl-delay: 5

# ┌─────────────────────────────────────────────────────────────────────────────┐
# │ 9. EXPLICIT ALLOWS - Important content paths                                │
# └─────────────────────────────────────────────────────────────────────────────┘

User-agent: *
#
# Important content paths - listed for reference only, deliberately WITHOUT
# Allow rules:
#   Core legal content:  /dokument/ (podrobnosti, besedilo), /sodnapraksa/,
#                        /zbirke/, /iskanje/sodnapraksa
#   AI features:         /tfl-ai/ (pravni-ai), /ai/
#   Events, education:   /dogodki/, /seminarnik/
#   Publications:        /publikacije/
#   Landing pages:       /landing/, /landingpage/
#   Company services:    /companycheck/predstavitev, /ekolex/predstavitev,
#                        /tflapi/predstavitev
# These paths are crawlable by default, so an Allow adds nothing. It would only
# do harm: when an Allow and a Disallow both match a URL the longer pattern wins,
# so e.g. "Allow: /iskanje/sodnapraksa" would out-rank "Disallow: /*?sort=*" and
# re-open the duplicate parameter URLs that section 6 blocks.
#
# Static assets (for rich snippets)
Allow: /theme/
Allow: /theme/images/
Allow: /scripts/
Allow: /content/
Allow: /content/images/
#
# Sitemaps and discovery files
Allow: /sitemap*.xml
Allow: /llms.txt
Allow: /llms-full.txt
Allow: /robots.txt
Allow: /favicon.ico

# ┌─────────────────────────────────────────────────────────────────────────────┐
# │ 10. SITEMAPS                                                                │
# └─────────────────────────────────────────────────────────────────────────────┘

# Absolute sitemap URLs on the https://www.tax-fin-lex.si host.
Sitemap: https://www.tax-fin-lex.si/sitemap.xml
Sitemap: https://www.tax-fin-lex.si/sitemap-news.xml

# ┌─────────────────────────────────────────────────────────────────────────────┐
# │ 11. AI/LLM CONTEXT FILES                                                    │
# └─────────────────────────────────────────────────────────────────────────────┘

# AI context files for LLM understanding
# Concise: https://www.tax-fin-lex.si/llms.txt
# Full:    https://www.tax-fin-lex.si/llms-full.txt
# Contains: Site description, legal abbreviations, citation format, API docs

# TDM (Text and Data Mining) Reservation - EU Copyright Directive Art. 4 compliance
# Location: https://www.tax-fin-lex.si/.well-known/tdmrep.json
# Policy: tdm-reservation=0 (AI mining ALLOWED for legal content)

# ═══════════════════════════════════════════════════════════════════════════════
# END OF ROBOTS.TXT
# For questions: info@tax-fin-lex.si
# ═══════════════════════════════════════════════════════════════════════════════