This commit introduces a comprehensive Astro integration that automatically generates discovery files for websites.

Features:
- robots.txt with LLM bot support (Anthropic-AI, GPTBot, etc.)
- llms.txt for AI assistant context and instructions
- humans.txt for team credits and site information
- Automatic sitemap integration via @astrojs/sitemap

Technical Details:
- TypeScript implementation with full type safety
- Configurable HTTP caching headers
- Custom template support for all generated files
- Sensible defaults with extensive customization options
- Date-based versioning (2025.11.03)

Testing:
- 34 unit tests covering all generators
- Test coverage for robots.txt, llms.txt, and humans.txt
- Integration with Vitest

Documentation:
- Comprehensive README with examples
- API reference documentation
- Contributing guidelines
- Example configurations (minimal and full)
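For orientation, a minimal sketch of how such an integration might be wired into an Astro project. The option names (robots, llms, humans, crawlDelay) are assumptions inferred from the feature list above and the generator below, not a confirmed public API:

// astro.config.ts — hypothetical wiring; option names are assumptions,
// not a confirmed API surface.
import { defineConfig } from 'astro/config';
import discovery from '@astrojs/discovery';

export default defineConfig({
  site: 'https://example.com',
  integrations: [
    discovery({
      robots: { crawlDelay: 10 }, // assumed: maps to the RobotsConfig read below
      llms: {},                   // assumed: llms.txt options
      humans: {},                 // assumed: humans.txt options
    }),
  ],
});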
103 lines | 2.5 KiB | TypeScript
import type { RobotsConfig } from '../types.js';

/**
 * Default LLM bot user agents that should have access to llms.txt
 */
const DEFAULT_LLM_BOTS = [
  'Anthropic-AI',
  'Claude-Web',
  'GPTBot',
  'ChatGPT-User',
  'cohere-ai',
  'Google-Extended',
  'PerplexityBot',
  'Applebot-Extended',
];

/**
 * Generate robots.txt content
 *
 * @param config - Robots.txt configuration
 * @param siteURL - Site base URL
 * @returns Generated robots.txt content
 */
export function generateRobotsTxt(
  config: RobotsConfig,
  siteURL: URL
): string {
  const lines: string[] = [];

  // Header comment
  lines.push('# robots.txt');
  lines.push(`# Generated by @astrojs/discovery for ${siteURL.hostname}`);
  lines.push('');

  // Allow all bots by default
  if (config.allowAllBots !== false) {
    lines.push('User-agent: *');
    lines.push('Allow: /');
    lines.push('');
  }

  // Add sitemap reference
  lines.push('# Sitemaps');
  lines.push(`Sitemap: ${new URL('sitemap-index.xml', siteURL).href}`);
  lines.push('');

  // LLM-specific rules
  if (config.llmBots?.enabled !== false) {
    lines.push('# LLM-specific resources');
    lines.push('# AI assistants can find additional context at /llms.txt');
    lines.push('# See: https://github.com/anthropics/llm-txt');
    lines.push('');

    const agents = config.llmBots?.agents || DEFAULT_LLM_BOTS;
    agents.forEach(agent => {
      lines.push(`User-agent: ${agent}`);
    });
    lines.push('Allow: /llms.txt');
    lines.push('Allow: /llms-full.txt');
    lines.push('');
  }

  // Additional agent rules
  if (config.additionalAgents && config.additionalAgents.length > 0) {
    lines.push('# Custom agent rules');
    lines.push('');

    config.additionalAgents.forEach(agent => {
      lines.push(`User-agent: ${agent.userAgent}`);

      if (agent.allow && agent.allow.length > 0) {
        agent.allow.forEach(path => {
          lines.push(`Allow: ${path}`);
        });
      }

      if (agent.disallow && agent.disallow.length > 0) {
        agent.disallow.forEach(path => {
          lines.push(`Disallow: ${path}`);
        });
      }

      lines.push('');
    });
  }

  // Crawl delay
  if (config.crawlDelay) {
    lines.push('# Crawl delay (be nice to our server)');
    lines.push(`Crawl-delay: ${config.crawlDelay}`);
    lines.push('');
  }

  // Custom rules
  if (config.customRules) {
    lines.push('# Custom rules');
    lines.push(config.customRules.trim());
    lines.push('');
  }

  return lines.join('\n').trim() + '\n';
}
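
A minimal usage sketch follows. The config shape mirrors exactly the properties generateRobotsTxt reads above (allowAllBots, llmBots, additionalAgents, crawlDelay, customRules); the import path is hypothetical:

// usage-sketch.ts — hedged example; the import path is an assumption.
import { generateRobotsTxt } from './generators/robots.js';

const robotsTxt = generateRobotsTxt(
  {
    allowAllBots: true,
    llmBots: { enabled: true, agents: ['GPTBot', 'Claude-Web'] },
    additionalAgents: [{ userAgent: 'BadBot', disallow: ['/'] }],
    crawlDelay: 10,
  },
  new URL('https://example.com')
);
console.log(robotsTxt);

// Expected output shape, traced from the function above (abridged):
// # robots.txt
// # Generated by @astrojs/discovery for example.com
//
// User-agent: *
// Allow: /
//
// # Sitemaps
// Sitemap: https://example.com/sitemap-index.xml
//
// User-agent: GPTBot
// User-agent: Claude-Web
// Allow: /llms.txt
// Allow: /llms-full.txt
// ...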