Ryan Malloy d25dde4627 feat: initial implementation of @astrojs/discovery integration
This commit introduces a comprehensive Astro integration that automatically
generates discovery files for websites:

Features (see the usage sketch after this list):
- robots.txt with LLM bot support (Anthropic-AI, GPTBot, etc.)
- llms.txt for AI assistant context and instructions
- humans.txt for team credits and site information
- Automatic sitemap integration via @astrojs/sitemap
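
A minimal sketch of wiring this up in a project follows. Only the package
names (@astrojs/discovery, @astrojs/sitemap) come from this commit; the
default export and zero-config call are assumptions, not confirmed API:

// astro.config.ts (hypothetical wiring)
import { defineConfig } from 'astro/config';
import sitemap from '@astrojs/sitemap';
import discovery from '@astrojs/discovery';

export default defineConfig({
  site: 'https://example.com',
  integrations: [
    sitemap(),
    discovery(), // defaults generate robots.txt, llms.txt, and humans.txt
  ],
});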

Technical Details:
- TypeScript implementation with full type safety (config shape sketched after this list)
- Configurable HTTP caching headers
- Custom template support for all generated files
- Sensible defaults with extensive customization options
- Date-based versioning (2025.11.03)
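
For type-safety context, here is the config shape implied by the generator
further down. The field names are read directly from how generateRobotsTxt
uses them, but the actual types.ts is not part of this excerpt:

// Sketch reconstructed from usage in generateRobotsTxt below
export interface RobotsConfig {
  /** Emit a permissive `User-agent: *` group (default: true) */
  allowAllBots?: boolean;
  /** LLM bot handling; enabled by default with a built-in agent list */
  llmBots?: {
    enabled?: boolean;
    agents?: string[];
  };
  /** Extra per-agent rule groups */
  additionalAgents?: Array<{
    userAgent: string;
    allow?: string[];
    disallow?: string[];
  }>;
  /** Crawl-delay value (conventionally seconds) */
  crawlDelay?: number;
  /** Raw robots.txt text appended near the end of the file */
  customRules?: string;
}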

Testing:
- 34 unit tests covering all generators
- Test coverage for robots.txt, llms.txt, and humans.txt
- Integration with Vitest (example test after this list)
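
One of those Vitest cases might look like the sketch below; the import path
is hypothetical, but the assertions match what the generator actually emits:

import { describe, expect, it } from 'vitest';
import { generateRobotsTxt } from '../src/generators/robots.js'; // assumed path

describe('generateRobotsTxt', () => {
  const site = new URL('https://example.com');

  it('references the sitemap index', () => {
    expect(generateRobotsTxt({}, site)).toContain(
      'Sitemap: https://example.com/sitemap-index.xml'
    );
  });

  it('grants default LLM bots access to llms.txt', () => {
    const output = generateRobotsTxt({}, site);
    expect(output).toContain('User-agent: GPTBot');
    expect(output).toContain('Allow: /llms.txt');
  });
});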

Documentation:
- Comprehensive README with examples
- API reference documentation
- Contributing guidelines
- Example configurations (minimal and full; a fuller sketch follows)
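
At the "full" end, a hedged sketch exercising the robots options the
generator actually reads; the surrounding `robots` key and integration
option shape are assumptions:

discovery({
  robots: {
    crawlDelay: 10,
    additionalAgents: [
      { userAgent: 'BadBot', disallow: ['/'] },
      { userAgent: 'FriendlyBot', allow: ['/docs/'], disallow: ['/admin/'] },
    ],
    customRules: '# Questions? webmaster@example.com',
  },
});
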
2025-11-03 07:36:39 -07:00

import type { RobotsConfig } from '../types.js';

/**
 * Default LLM bot user agents that should have access to llms.txt
 */
const DEFAULT_LLM_BOTS = [
  'Anthropic-AI',
  'Claude-Web',
  'GPTBot',
  'ChatGPT-User',
  'cohere-ai',
  'Google-Extended',
  'PerplexityBot',
  'Applebot-Extended',
];

/**
 * Generate robots.txt content
 *
 * @param config - Robots.txt configuration
 * @param siteURL - Site base URL
 * @returns Generated robots.txt content
 */
export function generateRobotsTxt(
  config: RobotsConfig,
  siteURL: URL
): string {
  const lines: string[] = [];

  // Header comment
  lines.push('# robots.txt');
  lines.push(`# Generated by @astrojs/discovery for ${siteURL.hostname}`);
  lines.push('');

  // Allow all bots by default
  if (config.allowAllBots !== false) {
    lines.push('User-agent: *');
    lines.push('Allow: /');
    lines.push('');
  }

  // Add sitemap reference
  lines.push('# Sitemaps');
  lines.push(`Sitemap: ${new URL('sitemap-index.xml', siteURL).href}`);
  lines.push('');

  // LLM-specific rules
  if (config.llmBots?.enabled !== false) {
    lines.push('# LLM-specific resources');
    lines.push('# AI assistants can find additional context at /llms.txt');
    lines.push('# See: https://github.com/anthropics/llm-txt');
    lines.push('');

    const agents = config.llmBots?.agents || DEFAULT_LLM_BOTS;
    agents.forEach(agent => {
      lines.push(`User-agent: ${agent}`);
    });
    lines.push('Allow: /llms.txt');
    lines.push('Allow: /llms-full.txt');
    lines.push('');
  }

  // Additional agent rules
  if (config.additionalAgents && config.additionalAgents.length > 0) {
    lines.push('# Custom agent rules');
    lines.push('');
    config.additionalAgents.forEach(agent => {
      lines.push(`User-agent: ${agent.userAgent}`);
      agent.allow?.forEach(path => lines.push(`Allow: ${path}`));
      agent.disallow?.forEach(path => lines.push(`Disallow: ${path}`));
      lines.push('');
    });
  }
  // Crawl delay: attach it to a User-agent group, since a Crawl-delay line
  // outside any group is ignored by the parsers that honor the directive
  if (config.crawlDelay) {
    lines.push('# Crawl delay (be nice to our server)');
    lines.push('User-agent: *');
    lines.push(`Crawl-delay: ${config.crawlDelay}`);
    lines.push('');
  }
  // Custom rules
  if (config.customRules) {
    lines.push('# Custom rules');
    lines.push(config.customRules.trim());
    lines.push('');
  }

  return lines.join('\n').trim() + '\n';
}