# Robots.txt for Baselight Data Platform
# Generated for optimal SEO and crawling efficiency

User-agent: *
# Set crawl delay to be respectful (1 second)
Crawl-delay: 1

# Link sitemap
Sitemap: https://baselight.app/sitemap.xml

# Allow indexing of main public pages
Allow: /
Allow: /catalog
Allow: /catalog?*

# Allow indexing of all public user content and profiles
Allow: /u/*/

# Block private/authenticated areas
Disallow: /auth
Disallow: /register
Disallow: /refresh
Disallow: /logout
Disallow: /invite
Disallow: /settings/
Disallow: /my-*
Disallow: /query/new

# Block API endpoints and AJAX requests
Disallow: /ajax/
Disallow: /_actions/
Disallow: /api/

# Block export/download endpoints (avoid bandwidth waste)
Disallow: /*/results.*
Disallow: /*/export.*

# Block query edit pages (view pages remain crawlable)
Disallow: /*/edit
Disallow: /*/edit/

# Block error pages
Disallow: /403
Disallow: /404
Disallow: /500
Disallow: /error

# Block development/admin paths
Disallow: /admin

# Set a higher crawl delay for specific bots to prevent them from being too aggressive
User-agent: Bingbot
Crawl-delay: 2

User-agent: YandexBot
Crawl-delay: 2

# Stop aggressive crawlers from consuming resources by setting a high crawl delay
User-agent: AhrefsBot
Crawl-delay: 10

User-agent: SemrushBot
Crawl-delay: 10

User-agent: DotBot
Crawl-delay: 10

User-agent: MJ12bot
Crawl-delay: 10

User-agent: BLEXBot
Crawl-delay: 10

# Social media crawlers inherit the global rules (no special config needed)
# facebookexternalhit/1.1, Twitterbot, LinkedInBot use default rules
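
# Notes:
# - Googlebot ignores the Crawl-delay directive; its crawl rate is managed
#   via Google Search Console. The delays above apply to crawlers that honor
#   Crawl-delay (Bingbot, YandexBot, and most of the SEO bots listed).
# - Wildcard (*) path patterns are supported by the major crawlers (Google,
#   Bing, Yandex) and by RFC 9309, but very old crawlers may ignore them.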