This commit introduces a comprehensive Astro integration that automatically generates discovery files for websites: Features: - robots.txt with LLM bot support (Anthropic-AI, GPTBot, etc.) - llms.txt for AI assistant context and instructions - humans.txt for team credits and site information - Automatic sitemap integration via @astrojs/sitemap Technical Details: - TypeScript implementation with full type safety - Configurable HTTP caching headers - Custom template support for all generated files - Sensible defaults with extensive customization options - Date-based versioning (2025.11.03) Testing: - 34 unit tests covering all generators - Test coverage for robots.txt, llms.txt, and humans.txt - Integration with Vitest Documentation: - Comprehensive README with examples - API reference documentation - Contributing guidelines - Example configurations (minimal and full)
700 lines
17 KiB
Markdown
700 lines
17 KiB
Markdown
# @astrojs/discovery - Implementation Guide
|
|
|
|
> Technical implementation details for building the Astro discovery integration
|
|
|
|
## Package Structure
|
|
|
|
```
|
|
@astrojs/discovery/
|
|
├── package.json
|
|
├── README.md
|
|
├── LICENSE
|
|
├── tsconfig.json
|
|
├── src/
|
|
│ ├── index.ts # Main entry point
|
|
│ ├── types.ts # TypeScript definitions
|
|
│ ├── generators/
|
|
│ │ ├── robots.ts # robots.txt generation
|
|
│ │ ├── llms.ts # llms.txt generation
|
|
│ │ ├── humans.ts # humans.txt generation
|
|
│ │ └── utils.ts # Shared utilities
|
|
│ ├── templates/
|
|
│ │ ├── robots.template.ts
|
|
│ │ ├── llms.template.ts
|
|
│ │ └── humans.template.ts
|
|
│ └── validators/
|
|
│ └── config.ts # Config validation
|
|
├── dist/ # Built output
|
|
└── tests/
|
|
├── robots.test.ts
|
|
├── llms.test.ts
|
|
├── humans.test.ts
|
|
└── integration.test.ts
|
|
```
|
|
|
|
## Core Implementation
|
|
|
|
### 1. Main Integration File (`src/index.ts`)
|
|
|
|
```typescript
|
|
import type { AstroIntegration } from 'astro';
|
|
import type { DiscoveryConfig } from './types';
|
|
import sitemap from '@astrojs/sitemap';
|
|
import { generateRobotsTxt } from './generators/robots';
|
|
import { generateLLMsTxt } from './generators/llms';
|
|
import { generateHumansTxt } from './generators/humans';
|
|
import { validateConfig } from './validators/config';
|
|
|
|
/**
 * Astro integration that wires up the discovery files.
 *
 * In `astro:config:setup` it requires `site` to be configured, registers
 * `@astrojs/sitemap`, and injects one prerendered route each for
 * `/robots.txt`, `/llms.txt` and `/humans.txt`, skipping a file only when its
 * `enabled` option is exactly `false`. In `astro:build:done` it logs the list
 * of generated paths.
 *
 * @param userConfig - User options; run through `validateConfig` before use.
 * @returns The integration object to list under `integrations`.
 * @throws Error from the `astro:config:setup` hook when `site` is not set.
 */
export default function discovery(
  userConfig: DiscoveryConfig = {}
): AstroIntegration {
  // NOTE(review): validateConfig presumably also merges defaults — confirm in validators/config.
  const config = validateConfig(userConfig);

  // The three text files share one naming scheme for route and entrypoint.
  const discoveryFiles = ['robots', 'llms', 'humans'] as const;
  const isEnabled = (file: (typeof discoveryFiles)[number]): boolean =>
    config[file]?.enabled !== false;

  return {
    name: '@astrojs/discovery',
    hooks: {
      'astro:config:setup': ({ config: astroConfig, injectRoute, updateConfig }) => {
        // Route output depends on the site URL, so refuse to run without it.
        if (!astroConfig.site) {
          throw new Error(
            '@astrojs/discovery requires `site` to be set in astro.config.mjs'
          );
        }

        // Register the sitemap integration with the user's sitemap options.
        updateConfig({
          integrations: [sitemap(config.sitemap || {})],
        });

        // Inject a prerendered route for every enabled discovery file.
        for (const file of discoveryFiles) {
          if (!isEnabled(file)) continue;
          injectRoute({
            pattern: `/${file}.txt`,
            entrypoint: `@astrojs/discovery/routes/${file}.ts`,
            prerender: true,
          });
        }
      },

      'astro:build:done': () => {
        // Build summary: enabled files first, then the sitemap index.
        console.log('✅ Discovery files generated:');
        for (const file of discoveryFiles) {
          if (isEnabled(file)) console.log(` - /${file}.txt`);
        }
        console.log(' - /sitemap-index.xml');
      },
    },
  };
}
|
|
|
|
// Named exports
|
|
export type { DiscoveryConfig } from './types';
|
|
```
|
|
|
|
### 2. Type Definitions (`src/types.ts`)
|
|
|
|
```typescript
|
|
/**
 * Options accepted by the `discovery()` integration. Every section is optional.
 */
export interface DiscoveryConfig {
  /** Options for the generated robots.txt. */
  robots?: RobotsConfig;

  /** Options for the generated llms.txt. */
  llms?: LLMsConfig;

  /** Options for the generated humans.txt. */
  humans?: HumansConfig;

  /** Options passed to `@astrojs/sitemap` when the integration registers it. */
  sitemap?: SitemapConfig;

  /** Cache lifetimes used for the `Cache-Control` header of generated files. */
  caching?: CachingConfig;

  /** Custom render functions used instead of the built-in generators. */
  templates?: TemplateConfig;
}
|
|
|
|
/**
 * Options for robots.txt generation.
 */
export interface RobotsConfig {
  /** When exactly `false`, the `/robots.txt` route is not injected. */
  enabled?: boolean;

  /** Value written as a `Crawl-delay` directive; omitted when falsy. */
  crawlDelay?: number;

  /** Unless exactly `false`, a `User-agent: *` / `Allow: /` group is emitted. */
  allowAllBots?: boolean;

  /** Group that allows LLM crawlers to fetch `/llms.txt`. */
  llmBots?: {
    /** Unless exactly `false`, the LLM group is emitted. */
    enabled?: boolean;
    /** User-agent names for the group; a built-in list is used when omitted. */
    agents?: string[];
  };

  /** Extra per-agent groups, each with its own `Allow` / `Disallow` paths. */
  additionalAgents?: {
    userAgent: string;
    allow?: string[];
    disallow?: string[];
  }[];

  /** Raw text appended verbatim under a `# Custom rules` comment. */
  customRules?: string;
}
|
|
|
|
/**
 * Options for llms.txt generation. Each populated field becomes one section
 * of the generated Markdown document.
 */
export interface LLMsConfig {
  /** When exactly `false`, the `/llms.txt` route is not injected. */
  enabled?: boolean;

  /** Site summary, or a function returning it; shown as a quote and in "Site Information". */
  description?: string | (() => string);

  /** Bullets for the "Key Features" section. */
  keyFeatures?: string[];

  /** Pages for "Important Pages", or an async function that returns them. */
  importantPages?: ImportantPage[] | (() => Promise<ImportantPage[]>);

  /** Free text placed under "For AI Assistants". */
  instructions?: string;

  /** Entries for the "API Endpoints" section. */
  apiEndpoints?: APIEndpoint[];

  /** Technologies listed under "Technical Stack". */
  techStack?: TechStack;

  /** Bullets for the "Brand Voice" section. */
  brandVoice?: string[];

  /** Extra sections: each key becomes a `##` heading, each value its body. */
  customSections?: Record<string, string>;
}
|
|
|
|
/**
 * Options for humans.txt generation. Each populated field becomes one
 * commented section of the generated file.
 */
export interface HumansConfig {
  /** When exactly `false`, the `/humans.txt` route is not injected. */
  enabled?: boolean;

  /** People listed in the TEAM section. */
  team?: TeamMember[];

  /** Bullets for the THANKS section. */
  thanks?: string[];

  /** Fields for the SITE section. */
  site?: SiteInfo;

  /** Free text for the THE STORY section. */
  story?: string;

  /** Bullets for the FUN FACTS section. */
  funFacts?: string[];

  /** Lines for the PHILOSOPHY section; each is wrapped in double quotes. */
  philosophy?: string[];

  /** Extra sections: each key is upper-cased into a heading, each value is its body. */
  customSections?: Record<string, string>;
}
|
|
|
|
/**
 * Options the integration passes to `sitemap()` from `@astrojs/sitemap`.
 *
 * NOTE(review): field semantics are defined by `@astrojs/sitemap`, not by this
 * package — confirm against its documentation.
 */
export interface SitemapConfig {
  /** Predicate over a page URL string. */
  filter?: (page: string) => boolean;

  /** Additional page URLs. */
  customPages?: string[];

  /** Change-frequency hint. */
  changefreq?:
    | 'always'
    | 'hourly'
    | 'daily'
    | 'weekly'
    | 'monthly'
    | 'yearly'
    | 'never';

  /** Priority hint. */
  priority?: number;
}
|
|
|
|
/**
 * Cache lifetimes for the generated files.
 *
 * The robots.txt route writes `robots` into `Cache-Control: public, max-age=…`.
 * NOTE(review): the other fields presumably feed the matching routes the same
 * way — those routes are not shown here.
 */
export interface CachingConfig {
  /** `max-age` for `/robots.txt`. */
  robots?: number;

  /** Lifetime for `/llms.txt`. */
  llms?: number;

  /** Lifetime for `/humans.txt`. */
  humans?: number;

  /** Lifetime for the sitemap. */
  sitemap?: number;
}
|
|
|
|
/**
 * Custom render functions. When `robots` is provided, the robots.txt route
 * calls it instead of the built-in generator and serves its return value.
 *
 * NOTE(review): `llms` and `humans` presumably work the same way — their
 * routes are not shown here.
 */
export interface TemplateConfig {
  /** Replacement renderer for robots.txt. */
  robots?: (config: RobotsConfig, siteURL: URL) => string;

  /** Replacement renderer for llms.txt. */
  llms?: (config: LLMsConfig, siteURL: URL) => string;

  /** Replacement renderer for humans.txt. */
  humans?: (config: HumansConfig, siteURL: URL) => string;
}
|
|
|
|
/**
 * One entry in the "Important Pages" section of llms.txt.
 */
export interface ImportantPage {
  /** Label shown in bold before the link. */
  name: string;

  /** Path or URL; resolved against the site URL when rendered. */
  path: string;

  /** Optional text printed on the line after the link. */
  description?: string;
}
|
|
|
|
/**
 * One entry in the "API Endpoints" section of llms.txt.
 */
export interface APIEndpoint {
  /** Endpoint path as it should be displayed. */
  path: string;

  /** HTTP method; rendered as `GET` when omitted. */
  method?: string;

  /** Text shown after the method and path. */
  description: string;
}
|
|
|
|
/**
 * Technologies listed under "Technical Stack" in llms.txt. Each category is
 * rendered as one comma-separated bullet.
 */
export interface TechStack {
  /** Frontend technologies. */
  frontend?: string[];

  /** Backend technologies. */
  backend?: string[];

  /** AI-related technologies. */
  ai?: string[];

  /** Anything that does not fit the other categories. */
  other?: string[];
}
|
|
|
|
/**
 * One person in the TEAM section of humans.txt. Only `name` is required;
 * every other populated field adds one labelled line.
 */
export interface TeamMember {
  /** Printed as `Name: …`. */
  name: string;

  /** Printed as `Role: …`. */
  role?: string;

  /** Printed as `Contact: …`. */
  contact?: string;

  /** Printed as `From: …`. */
  location?: string;

  /** Printed as `Twitter: …`. */
  twitter?: string;

  /** Printed as `GitHub: …`. */
  github?: string;
}
|
|
|
|
/**
 * Fields for the SITE section of humans.txt. Each populated field adds one
 * labelled line; list fields are joined with `, `.
 */
export interface SiteInfo {
  /**
   * Printed as `Last update: …`. The special value `'auto'` is replaced by the
   * current date (`YYYY-MM-DD`) when the file is generated; any other string is
   * printed as given.
   *
   * Typed as plain `string`: the former `string | 'auto'` collapsed to `string`
   * anyway, so the literal member added no checking.
   */
  lastUpdate?: string;

  /** Printed as `Language: …`. */
  language?: string;

  /** Printed as `Doctype: …`. */
  doctype?: string;

  /** Printed as `IDE: …`. */
  ide?: string;

  /** Printed as `Tech Stack: …`. */
  techStack?: string[];

  /** Printed as `Standards: …`. */
  standards?: string[];

  /** Printed as `Components: …`. */
  components?: string[];

  /** Printed as `Software: …`. */
  software?: string[];
}
|
|
```
|
|
|
|
### 3. Robots.txt Generator (`src/generators/robots.ts`)
|
|
|
|
```typescript
|
|
import type { RobotsConfig } from '../types';
|
|
|
|
/** User-agent names used for the llms.txt group when `llmBots.agents` is not set. */
const DEFAULT_LLM_BOTS = [
  'Anthropic-AI',
  'Claude-Web',
  'GPTBot',
  'ChatGPT-User',
  'cohere-ai',
  'Google-Extended'
];

/**
 * Build the text of robots.txt.
 *
 * Sections, in order: the wildcard group, the sitemap reference, the LLM
 * crawler group, any additional per-agent groups, and custom rules.
 *
 * @param config - robots.txt options.
 * @param siteURL - Site base URL; the sitemap link is resolved against it.
 * @returns The file content, lines joined with `\n`.
 */
export function generateRobotsTxt(
  config: RobotsConfig,
  siteURL: URL
): string {
  const lines: string[] = [];

  const allowAll = config.allowAllBots !== false;
  const crawlDelay = config.crawlDelay;
  // Only a positive, finite number is a meaningful delay.
  const hasCrawlDelay =
    typeof crawlDelay === 'number' && Number.isFinite(crawlDelay) && crawlDelay > 0;

  // Wildcard group. Crawl-delay is a per-group rule, so it must sit under a
  // User-agent line; emitted on its own after the other groups it would be
  // ignored or attached to whichever group happened to come last.
  if (allowAll || hasCrawlDelay) {
    lines.push('User-agent: *');
    if (allowAll) {
      lines.push('Allow: /');
    }
    if (hasCrawlDelay) {
      lines.push('# Crawl delay (be nice to our server)');
      lines.push(`Crawl-delay: ${crawlDelay}`);
    }
    lines.push('');
  }

  // Sitemap reference (not tied to any group).
  lines.push('# Sitemaps');
  lines.push(`Sitemap: ${new URL('sitemap-index.xml', siteURL).href}`);
  lines.push('');

  // LLM crawler group: several User-agent lines sharing one rule.
  if (config.llmBots?.enabled !== false) {
    const agents = config.llmBots?.agents || DEFAULT_LLM_BOTS;
    // Without at least one User-agent line the Allow rule would be orphaned.
    if (agents.length > 0) {
      lines.push('# LLM-specific resources');
      lines.push('# See: https://github.com/anthropics/llm-txt');
      for (const agent of agents) {
        lines.push(`User-agent: ${agent}`);
      }
      lines.push('Allow: /llms.txt');
      lines.push('');
    }
  }

  // Additional per-agent groups.
  for (const agent of config.additionalAgents ?? []) {
    lines.push(`User-agent: ${agent.userAgent}`);
    for (const path of agent.allow ?? []) {
      lines.push(`Allow: ${path}`);
    }
    for (const path of agent.disallow ?? []) {
      lines.push(`Disallow: ${path}`);
    }
    lines.push('');
  }

  // Custom rules, appended verbatim.
  if (config.customRules) {
    lines.push('# Custom rules');
    lines.push(config.customRules);
    lines.push('');
  }

  return lines.join('\n');
}
|
|
```
|
|
|
|
### 4. LLMs.txt Generator (`src/generators/llms.ts`)
|
|
|
|
```typescript
|
|
import type { LLMsConfig, ImportantPage } from '../types';
|
|
|
|
/** Append a `heading` + bullet list section; does nothing for a missing or empty list. */
function pushBulletSection(
  lines: string[],
  heading: string,
  items: readonly string[] | undefined
): void {
  if (!items || items.length === 0) return;
  lines.push(heading, '');
  for (const item of items) {
    lines.push(`- ${item}`);
  }
  lines.push('');
}

/**
 * Build the Markdown text of llms.txt.
 *
 * Sections are emitted in a fixed order and only when they have content:
 * header, Site Information, Key Features, Important Pages, For AI Assistants,
 * API Endpoints, Technical Stack, Brand Voice, custom sections, footer.
 *
 * @param config - llms.txt options.
 * @param siteURL - Site base URL; used for the title and to resolve page paths.
 * @returns The file content, lines joined with `\n`. The footer carries the
 *   current date, so output differs from day to day.
 */
export async function generateLLMsTxt(
  config: LLMsConfig,
  siteURL: URL
): Promise<string> {
  const lines: string[] = [];

  // Header: host name as title, optional description as a quote.
  const description =
    typeof config.description === 'function'
      ? config.description()
      : config.description;

  lines.push(`# ${siteURL.hostname}`);
  if (description) {
    lines.push('', `> ${description}`);
  }
  lines.push('', '---', '');

  // Site Information
  lines.push('## Site Information', '', `- **URL**: ${siteURL.href}`);
  if (description) {
    lines.push(`- **Description**: ${description}`);
  }
  lines.push('');

  // Key Features
  pushBulletSection(lines, '## Key Features', config.keyFeatures);

  // Important Pages: either a static list or an async provider.
  if (config.importantPages) {
    const pages =
      typeof config.importantPages === 'function'
        ? await config.importantPages()
        : config.importantPages;

    if (pages.length > 0) {
      lines.push('## Important Pages', '');
      for (const page of pages) {
        const url = new URL(page.path, siteURL).href;
        lines.push(`- **${page.name}**: ${url}`);
        if (page.description) {
          lines.push(` ${page.description}`);
        }
      }
      lines.push('');
    }
  }

  // Instructions for AI assistants
  if (config.instructions) {
    lines.push('## For AI Assistants', '', config.instructions, '');
  }

  // API Endpoints
  if (config.apiEndpoints && config.apiEndpoints.length > 0) {
    lines.push('## API Endpoints', '');
    for (const endpoint of config.apiEndpoints) {
      const method = endpoint.method || 'GET';
      lines.push(`- \`${method} ${endpoint.path}\` - ${endpoint.description}`);
    }
    lines.push('');
  }

  // Technical Stack: skip empty categories, and skip the whole section when
  // nothing is left, so the file never contains a bare heading or a label
  // with no values after it.
  if (config.techStack) {
    const categories: Array<[string, string[] | undefined]> = [
      ['Frontend', config.techStack.frontend],
      ['Backend', config.techStack.backend],
      ['AI', config.techStack.ai],
      ['Other', config.techStack.other],
    ];
    const rows: string[] = [];
    for (const [label, items] of categories) {
      if (items && items.length > 0) {
        rows.push(`- **${label}**: ${items.join(', ')}`);
      }
    }
    if (rows.length > 0) {
      lines.push('## Technical Stack', '', ...rows, '');
    }
  }

  // Brand Voice
  pushBulletSection(lines, '## Brand Voice', config.brandVoice);

  // Custom sections, in the object's own key order.
  if (config.customSections) {
    for (const [title, content] of Object.entries(config.customSections)) {
      lines.push(`## ${title}`, '', content, '');
    }
  }

  // Footer with the generation date (YYYY-MM-DD, UTC).
  lines.push('---', '', `Last Updated: ${new Date().toISOString().split('T')[0]}`);

  return lines.join('\n');
}
|
|
```
|
|
|
|
### 5. Humans.txt Generator (`src/generators/humans.ts`)
|
|
|
|
```typescript
|
|
import type { HumansConfig } from '../types';
|
|
|
|
/** Append `Label: a, b, c` to `target`; does nothing for a missing or empty list. */
function pushJoinedList(
  target: string[],
  label: string,
  items: readonly string[] | undefined
): void {
  if (items && items.length > 0) {
    target.push(`${label}: ${items.join(', ')}`);
  }
}

/**
 * Build the text of humans.txt.
 *
 * Sections are emitted in a fixed order and only when they have content:
 * TEAM, THANKS, SITE, THE STORY, FUN FACTS, PHILOSOPHY, then custom sections.
 *
 * @param config - humans.txt options.
 * @returns The file content, lines joined with `\n`; an empty string when no
 *   section has content.
 */
export function generateHumansTxt(config: HumansConfig): string {
  const lines: string[] = [];

  // TEAM: one labelled block per member, blocks separated by a blank line.
  if (config.team && config.team.length > 0) {
    lines.push('/* TEAM */', '');

    config.team.forEach((member, index) => {
      if (index > 0) lines.push('');
      lines.push(`Name: ${member.name}`);
      if (member.role) lines.push(`Role: ${member.role}`);
      if (member.contact) lines.push(`Contact: ${member.contact}`);
      if (member.location) lines.push(`From: ${member.location}`);
      if (member.twitter) lines.push(`Twitter: ${member.twitter}`);
      if (member.github) lines.push(`GitHub: ${member.github}`);
    });

    lines.push('');
  }

  // THANKS
  if (config.thanks && config.thanks.length > 0) {
    lines.push('/* THANKS */', '');
    for (const entry of config.thanks) {
      lines.push(`- ${entry}`);
    }
    lines.push('');
  }

  // SITE: collect the fields first so an empty `site` object (or one holding
  // only empty lists) yields no header and no labels with nothing after them.
  if (config.site) {
    const site = config.site;
    const siteLines: string[] = [];

    // 'auto' means "today", formatted as YYYY-MM-DD (UTC).
    const lastUpdate =
      site.lastUpdate === 'auto'
        ? new Date().toISOString().split('T')[0]
        : site.lastUpdate;

    if (lastUpdate) siteLines.push(`Last update: ${lastUpdate}`);
    if (site.language) siteLines.push(`Language: ${site.language}`);
    if (site.doctype) siteLines.push(`Doctype: ${site.doctype}`);
    if (site.ide) siteLines.push(`IDE: ${site.ide}`);

    pushJoinedList(siteLines, 'Tech Stack', site.techStack);
    pushJoinedList(siteLines, 'Standards', site.standards);
    pushJoinedList(siteLines, 'Components', site.components);
    pushJoinedList(siteLines, 'Software', site.software);

    if (siteLines.length > 0) {
      lines.push('/* SITE */', '', ...siteLines, '');
    }
  }

  // THE STORY
  if (config.story) {
    lines.push('/* THE STORY */', '', config.story, '');
  }

  // FUN FACTS
  if (config.funFacts && config.funFacts.length > 0) {
    lines.push('/* FUN FACTS */', '');
    for (const fact of config.funFacts) {
      lines.push(`- ${fact}`);
    }
    lines.push('');
  }

  // PHILOSOPHY: each entry is quoted.
  if (config.philosophy && config.philosophy.length > 0) {
    lines.push('/* PHILOSOPHY */', '');
    for (const item of config.philosophy) {
      lines.push(`"${item}"`);
    }
    lines.push('');
  }

  // Custom sections, in the object's own key order; titles are upper-cased.
  if (config.customSections) {
    for (const [title, content] of Object.entries(config.customSections)) {
      lines.push(`/* ${title.toUpperCase()} */`, '', content, '');
    }
  }

  return lines.join('\n');
}
|
|
```
|
|
|
|
### 6. API Route Template (`src/routes/robots.ts`)
|
|
|
|
```typescript
|
|
import type { APIRoute } from 'astro';
|
|
import { generateRobotsTxt } from '../generators/robots';
|
|
import { getConfig } from '../config';
|
|
|
|
/**
 * Serves `/robots.txt`.
 *
 * Renders with the user's `templates.robots` function when one is configured,
 * otherwise with the built-in `generateRobotsTxt`, and returns the text as
 * UTF-8 plain text with a public `Cache-Control` header.
 *
 * @param context - Astro API context; only `site` is read.
 * @returns A 200 response with the robots.txt body.
 */
export const GET: APIRoute = ({ site }) => {
  const config = getConfig();
  // Astro leaves `site` undefined when it is not configured; fall back to the
  // local dev address used here.
  const siteURL = site ?? new URL('http://localhost:4321');

  const content = config.templates?.robots
    ? config.templates.robots(config.robots, siteURL)
    : generateRobotsTxt(config.robots, siteURL);

  // `??` rather than `||`: an explicit 0 ("do not cache") must not turn into
  // the one-hour default.
  const maxAge = config.caching?.robots ?? 3600;

  return new Response(content, {
    status: 200,
    headers: {
      'Content-Type': 'text/plain; charset=utf-8',
      'Cache-Control': `public, max-age=${maxAge}`,
    },
  });
};
|
|
```
|
|
|
|
## Testing Strategy
|
|
|
|
### Unit Tests
|
|
|
|
```typescript
|
|
// tests/robots.test.ts
|
|
import { describe, it, expect } from 'vitest';
|
|
import { generateRobotsTxt } from '../src/generators/robots';
|
|
|
|
describe('generateRobotsTxt', () => {
  // Every case renders against the same origin; build a fresh URL per call.
  const exampleSite = (): URL => new URL('https://example.com');

  it('generates basic robots.txt', () => {
    const output = generateRobotsTxt({}, exampleSite());

    expect(output).toContain('User-agent: *');
    expect(output).toContain('Sitemap: https://example.com/sitemap-index.xml');
  });

  it('includes LLM bots when enabled', () => {
    const options = { llmBots: { enabled: true } };
    const output = generateRobotsTxt(options, exampleSite());

    expect(output).toContain('Anthropic-AI');
    expect(output).toContain('GPTBot');
  });

  it('respects custom crawl delay', () => {
    const options = { crawlDelay: 5 };
    const output = generateRobotsTxt(options, exampleSite());

    expect(output).toContain('Crawl-delay: 5');
  });
});
|
|
```
|
|
|
|
### Integration Tests
|
|
|
|
```typescript
|
|
// tests/integration.test.ts
|
|
import { describe, it, expect } from 'vitest';
|
|
import { testIntegration } from '@astrojs/test-utils';
|
|
import discovery from '../src/index';
|
|
|
|
describe('discovery integration', () => {
  it('generates all discovery files', async () => {
    // Build a fixture site with the integration at its default settings.
    const fixture = await testIntegration({
      integrations: [discovery()],
      site: 'https://example.com',
    });

    const emitted = await fixture.readdir('dist');

    // All three text files plus the sitemap index must be in the output.
    const expectedFiles = ['robots.txt', 'llms.txt', 'humans.txt', 'sitemap-index.xml'];
    for (const fileName of expectedFiles) {
      expect(emitted).toContain(fileName);
    }
  });
});
|
|
```
|
|
|
|
## Build & Publish
|
|
|
|
### package.json
|
|
|
|
```json
|
|
{
|
|
"name": "@astrojs/discovery",
|
|
"version": "1.0.0",
|
|
"description": "Complete discovery integration for Astro",
|
|
"type": "module",
|
|
"exports": {
|
|
".": "./dist/index.js",
|
|
"./routes/*": "./dist/routes/*"
|
|
},
|
|
"files": [
|
|
"dist",
|
|
"README.md"
|
|
],
|
|
"scripts": {
|
|
"build": "tsc",
|
|
"test": "vitest",
|
|
"prepublishOnly": "npm run build && npm test"
|
|
},
|
|
"peerDependencies": {
|
|
"astro": "^5.0.0"
|
|
},
|
|
"dependencies": {
|
|
"@astrojs/sitemap": "^3.6.0"
|
|
},
|
|
"devDependencies": {
|
|
"@astrojs/test-utils": "^1.0.0",
|
|
"typescript": "^5.3.0",
|
|
"vitest": "^1.0.0"
|
|
},
|
|
"keywords": [
|
|
"astro",
|
|
"astro-integration",
|
|
"robots",
|
|
"sitemap",
|
|
"llms",
|
|
"humans",
|
|
"discovery",
|
|
"seo"
|
|
]
|
|
}
|
|
```
|
|
|
|
## Future Enhancements
|
|
|
|
1. **security.txt Support** - Add RFC 9116 security.txt generation
|
|
2. **ads.txt Support** - For sites with advertising
|
|
3. **manifest.json Support** - PWA manifest generation
|
|
4. **RSS Feed Integration** - Optional RSS feed generation
|
|
5. **OpenGraph Tags** - Meta tag injection
|
|
6. **Structured Data** - JSON-LD schema.org markup
|
|
7. **Analytics Integration** - Built-in analytics discovery
|
|
8. **i18n Support** - Multi-language discovery files
|
|
|
|
## Resources
|
|
|
|
- [Astro Integration API](https://docs.astro.build/en/reference/integrations-reference/)
|
|
- [humanstxt.org](https://humanstxt.org/)
|
|
- [robots.txt spec](https://developers.google.com/search/docs/crawling-indexing/robots/intro)
|
|
- [llms.txt proposal](https://llmstxt.org/)
|
|
|
|
---
|
|
|
|
**This integration is a proposal. Implementation details may vary based on Astro's API evolution.**
|