astro-discovery/astro-discovery-implementation.md
Ryan Malloy d25dde4627 feat: initial implementation of @astrojs/discovery integration
This commit introduces a comprehensive Astro integration that automatically
generates discovery files for websites:

Features:
- robots.txt with LLM bot support (Anthropic-AI, GPTBot, etc.)
- llms.txt for AI assistant context and instructions
- humans.txt for team credits and site information
- Automatic sitemap integration via @astrojs/sitemap

Technical Details:
- TypeScript implementation with full type safety
- Configurable HTTP caching headers
- Custom template support for all generated files
- Sensible defaults with extensive customization options
- Date-based versioning (2025.11.03)

Testing:
- 34 unit tests covering all generators
- Test coverage for robots.txt, llms.txt, and humans.txt
- Integration with Vitest

Documentation:
- Comprehensive README with examples
- API reference documentation
- Contributing guidelines
- Example configurations (minimal and full)
2025-11-03 07:36:39 -07:00

17 KiB

@astrojs/discovery - Implementation Guide

Technical implementation details for building the Astro discovery integration

Package Structure

@astrojs/discovery/
├── package.json
├── README.md
├── LICENSE
├── tsconfig.json
├── src/
│   ├── index.ts                 # Main entry point
│   ├── types.ts                 # TypeScript definitions
│   ├── generators/
│   │   ├── robots.ts           # robots.txt generation
│   │   ├── llms.ts             # llms.txt generation
│   │   ├── humans.ts           # humans.txt generation
│   │   └── utils.ts            # Shared utilities
│   ├── templates/
│   │   ├── robots.template.ts
│   │   ├── llms.template.ts
│   │   └── humans.template.ts
│   └── validators/
│       └── config.ts            # Config validation
├── dist/                        # Built output
└── tests/
    ├── robots.test.ts
    ├── llms.test.ts
    ├── humans.test.ts
    └── integration.test.ts

Core Implementation

1. Main Integration File (src/index.ts)

import type { AstroIntegration } from 'astro';
import type { DiscoveryConfig } from './types';
import sitemap from '@astrojs/sitemap';
import { generateRobotsTxt } from './generators/robots';
import { generateLLMsTxt } from './generators/llms';
import { generateHumansTxt } from './generators/humans';
import { validateConfig } from './validators/config';

/**
 * Creates the `@astrojs/discovery` Astro integration.
 *
 * During `astro:config:setup` the integration checks that `site` is set,
 * registers `@astrojs/sitemap`, and injects one prerendered route per
 * discovery file that has not been switched off with `enabled: false`.
 * After the build it logs which discovery files were enabled.
 *
 * @param userConfig - User-supplied options; normalised by `validateConfig`
 *   (defined elsewhere) before use.
 * @returns The integration object to list under `integrations`.
 * @throws Error from the `astro:config:setup` hook when `site` is not set.
 */
export default function discovery(
  userConfig: DiscoveryConfig = {}
): AstroIntegration {
  // Merge with defaults
  const config = validateConfig(userConfig);

  // Single source of truth for the optional discovery files, so the routes
  // that get injected and the post-build summary can never disagree.
  // A file is enabled unless its section explicitly sets `enabled: false`.
  const enabledRoutes = [
    {
      pattern: '/robots.txt',
      entrypoint: '@astrojs/discovery/routes/robots.ts',
      enabled: config.robots?.enabled !== false
    },
    {
      pattern: '/llms.txt',
      entrypoint: '@astrojs/discovery/routes/llms.ts',
      enabled: config.llms?.enabled !== false
    },
    {
      pattern: '/humans.txt',
      entrypoint: '@astrojs/discovery/routes/humans.ts',
      enabled: config.humans?.enabled !== false
    }
  ].filter(route => route.enabled);

  return {
    name: '@astrojs/discovery',
    hooks: {
      'astro:config:setup': ({ config: astroConfig, injectRoute, updateConfig }) => {
        // Ensure site is configured
        if (!astroConfig.site) {
          throw new Error(
            '@astrojs/discovery requires `site` to be set in astro.config.mjs'
          );
        }

        // Add sitemap integration
        updateConfig({
          integrations: [
            sitemap(config.sitemap || {})
          ]
        });

        // Inject a prerendered route for every enabled discovery file
        for (const { pattern, entrypoint } of enabledRoutes) {
          injectRoute({ pattern, entrypoint, prerender: true });
        }
      },

      'astro:build:done': () => {
        // Post-build summary. This only reports which files were enabled;
        // it does not inspect the build output on disk.
        console.log('✅ Discovery files generated:');
        for (const { pattern } of enabledRoutes) {
          console.log(`  - ${pattern}`);
        }
        console.log('  - /sitemap-index.xml');
      }
    }
  };
}

// Named exports
export type { DiscoveryConfig } from './types';

2. Type Definitions (src/types.ts)

/**
 * Top-level options accepted by the `discovery()` integration.
 * Every section is optional.
 */
export interface DiscoveryConfig {
  /** Options for `/robots.txt`; its route is injected unless `enabled` is `false`. */
  robots?: RobotsConfig;
  /** Options for `/llms.txt`; its route is injected unless `enabled` is `false`. */
  llms?: LLMsConfig;
  /** Options for `/humans.txt`; its route is injected unless `enabled` is `false`. */
  humans?: HumansConfig;
  /** Options handed to `sitemap()` from `@astrojs/sitemap` (an empty object when omitted). */
  sitemap?: SitemapConfig;
  /** Cache lifetimes used when building `Cache-Control` response headers. */
  caching?: CachingConfig;
  /** Custom render functions that replace the built-in generators. */
  templates?: TemplateConfig;
}

/** Options controlling the generated `robots.txt`. */
export interface RobotsConfig {
  /** Set to `false` to skip injecting the `/robots.txt` route. */
  enabled?: boolean;
  /** Value written in a `Crawl-delay:` directive; nothing is written when unset or `0`. */
  crawlDelay?: number;
  /** Unless `false`, a `User-agent: *` group with `Allow: /` is emitted. */
  allowAllBots?: boolean;
  /** Group that points LLM crawlers at `/llms.txt`. */
  llmBots?: {
    /** Unless `false`, the LLM crawler group is emitted. */
    enabled?: boolean;
    /** User-agent tokens to list; the generator's built-in default list is used when omitted. */
    agents?: string[];
  };
  /** Extra per-agent groups; each entry becomes its own `User-agent` group. */
  additionalAgents?: Array<{
    /** Value of the group's `User-agent:` line. */
    userAgent: string;
    /** Paths written as `Allow:` lines. */
    allow?: string[];
    /** Paths written as `Disallow:` lines. */
    disallow?: string[];
  }>;
  /** Raw text appended as-is under a `# Custom rules` comment. */
  customRules?: string;
}

/** Options controlling the generated `llms.txt`. */
export interface LLMsConfig {
  /** Set to `false` to skip injecting the `/llms.txt` route. */
  enabled?: boolean;
  /**
   * Site description, or a function returning it. Rendered as a blockquote
   * under the title and again in the "Site Information" list.
   */
  description?: string | (() => string);
  /** Bullet list rendered under "## Key Features" when non-empty. */
  keyFeatures?: string[];
  /** Pages listed under "## Important Pages"; may be an async function returning them. */
  importantPages?: ImportantPage[] | (() => Promise<ImportantPage[]>);
  /** Free-form text rendered under "## For AI Assistants". */
  instructions?: string;
  /** Endpoints listed under "## API Endpoints" when non-empty. */
  apiEndpoints?: APIEndpoint[];
  /** Technologies listed under "## Technical Stack". */
  techStack?: TechStack;
  /** Bullet list rendered under "## Brand Voice" when non-empty. */
  brandVoice?: string[];
  /** Extra sections: each key becomes a `##` heading and its value the section body. */
  customSections?: Record<string, string>;
}

/** Options controlling the generated `humans.txt`. */
export interface HumansConfig {
  /** Set to `false` to skip injecting the `/humans.txt` route. */
  enabled?: boolean;
  /** People listed in the TEAM section when non-empty. */
  team?: TeamMember[];
  /** Bullet list for the THANKS section when non-empty. */
  thanks?: string[];
  /** Fields for the SITE section. */
  site?: SiteInfo;
  /** Free-form text for the THE STORY section. */
  story?: string;
  /** Bullet list for the FUN FACTS section when non-empty. */
  funFacts?: string[];
  /** Items for the PHILOSOPHY section; each is written wrapped in double quotes. */
  philosophy?: string[];
  /** Extra sections: each key is upper-cased into a section banner, the value is the body. */
  customSections?: Record<string, string>;
}

/**
 * Options the integration passes to `sitemap()` from `@astrojs/sitemap`.
 * NOTE(review): the fields are presumably forwarded unchanged; confirm the
 * option names against the `@astrojs/sitemap` version in use.
 */
export interface SitemapConfig {
  /** Predicate over a page string; presumably returns `true` to keep the page (confirm). */
  filter?: (page: string) => boolean;
  /** Additional page entries; presumably absolute URLs (confirm). */
  customPages?: string[];
  /** Change-frequency hint, limited to the listed literal values. */
  changefreq?: 'always' | 'hourly' | 'daily' | 'weekly' | 'monthly' | 'yearly' | 'never';
  /** Priority hint; presumably in the range 0.0 to 1.0 (confirm). */
  priority?: number;
}

/** Cache lifetimes used for the `max-age` value of `Cache-Control` headers. */
export interface CachingConfig {
  /** `max-age` for `/robots.txt`; the robots route falls back to 3600 when unset. */
  robots?: number;
  /** Presumably the `max-age` for `/llms.txt`; its route is not shown here (confirm). */
  llms?: number;
  /** Presumably the `max-age` for `/humans.txt`; its route is not shown here (confirm). */
  humans?: number;
  /** Presumably the `max-age` for the sitemap; no consumer is shown here (confirm). */
  sitemap?: number;
}

/**
 * Custom render functions. Each receives the file's config section and the
 * site URL and returns the complete file contents.
 */
export interface TemplateConfig {
  /** When provided, the robots route calls this instead of `generateRobotsTxt`. */
  robots?: (config: RobotsConfig, siteURL: URL) => string;
  /** Presumably replaces the built-in `llms.txt` generator; its route is not shown here (confirm). */
  llms?: (config: LLMsConfig, siteURL: URL) => string;
  /** Presumably replaces the built-in `humans.txt` generator; its route is not shown here (confirm). */
  humans?: (config: HumansConfig, siteURL: URL) => string;
}

/** One entry of the "Important Pages" list in `llms.txt`. */
export interface ImportantPage {
  /** Label rendered in bold before the URL. */
  name: string;
  /** Path or URL; resolved against the site URL with `new URL(path, siteURL)`. */
  path: string;
  /** Optional text rendered on an indented line below the entry. */
  description?: string;
}

/** One entry of the "API Endpoints" list in `llms.txt`. */
export interface APIEndpoint {
  /** Endpoint path, rendered as given. */
  path: string;
  /** HTTP method; the `llms.txt` generator renders `GET` when omitted. */
  method?: string;
  /** Text rendered after the method and path. */
  description: string;
}

/**
 * Technologies grouped by category for the "Technical Stack" section of
 * `llms.txt`. Each category is rendered as one comma-separated bullet.
 */
export interface TechStack {
  /** Rendered under the "Frontend" label. */
  frontend?: string[];
  /** Rendered under the "Backend" label. */
  backend?: string[];
  /** Rendered under the "AI" label. */
  ai?: string[];
  /** Rendered under the "Other" label. */
  other?: string[];
}

/** One person in the TEAM section of `humans.txt`. Only `name` is required. */
export interface TeamMember {
  /** Written as `Name: …`. */
  name: string;
  /** Written as `Role: …` when set. */
  role?: string;
  /** Written as `Contact: …` when set. */
  contact?: string;
  /** Written as `From: …` when set. */
  location?: string;
  /** Written as `Twitter: …` when set. */
  twitter?: string;
  /** Written as `GitHub: …` when set. */
  github?: string;
}

/** Fields of the SITE section of `humans.txt`. */
export interface SiteInfo {
  /**
   * Written as `Last update: …`. The special value `'auto'` is replaced with
   * the current date (the `YYYY-MM-DD` part of `Date#toISOString`).
   * Note: `'auto'` is already covered by `string`, so the union is
   * documentary only and does not narrow the type.
   */
  lastUpdate?: string | 'auto';
  /** Written as `Language: …` when set. */
  language?: string;
  /** Written as `Doctype: …` when set. */
  doctype?: string;
  /** Written as `IDE: …` when set. */
  ide?: string;
  /** Written as `Tech Stack: …`, joined with `, `. */
  techStack?: string[];
  /** Written as `Standards: …`, joined with `, `. */
  standards?: string[];
  /** Written as `Components: …`, joined with `, `. */
  components?: string[];
  /** Written as `Software: …`, joined with `, `. */
  software?: string[];
}

3. Robots.txt Generator (src/generators/robots.ts)

import type { RobotsConfig } from '../types';

/**
 * User-agent tokens listed in the LLM crawler group when
 * `llmBots.agents` is not provided.
 */
const DEFAULT_LLM_BOTS = [
  'Anthropic-AI',
  'Claude-Web',
  'GPTBot',
  'ChatGPT-User',
  'cohere-ai',
  'Google-Extended'
];

/**
 * Builds the contents of `robots.txt`.
 *
 * Output order: optional catch-all group, sitemap reference, optional LLM
 * crawler group, one group per `additionalAgents` entry, then custom rules.
 *
 * `Crawl-delay` is a per-group directive, so when a positive `crawlDelay` is
 * configured it is written as the last line of every `User-agent` group
 * instead of as a free-standing line after all groups (where it would be
 * orphaned or silently bind to whichever group happened to come last). If
 * the catch-all group is disabled, a rule-less `User-agent: *` group is
 * emitted solely to carry the delay.
 *
 * @param config - Robots options; every field is optional.
 * @param siteURL - Site URL used to build the absolute sitemap URL.
 * @returns The file contents, newline-separated.
 */
export function generateRobotsTxt(
  config: RobotsConfig,
  siteURL: URL
): string {
  const lines: string[] = [];

  // Only a finite, positive number is a usable delay.
  const delay = config.crawlDelay;
  const crawlDelayLine =
    typeof delay === 'number' && Number.isFinite(delay) && delay > 0
      ? `Crawl-delay: ${delay}`
      : undefined;

  // Finishes the current User-agent group: optional delay, then a blank line.
  const closeGroup = (): void => {
    if (crawlDelayLine !== undefined) lines.push(crawlDelayLine);
    lines.push('');
  };

  // Allow all bots by default
  if (config.allowAllBots !== false) {
    lines.push('User-agent: *', 'Allow: /');
    closeGroup();
  } else if (crawlDelayLine !== undefined) {
    // No catch-all rules requested, but the delay still needs a group to
    // live in. A group without Allow/Disallow lines restricts nothing.
    lines.push('User-agent: *');
    closeGroup();
  }

  // Add sitemap (not part of any group)
  lines.push('# Sitemaps');
  lines.push(`Sitemap: ${new URL('sitemap-index.xml', siteURL).href}`);
  lines.push('');

  // LLM-specific rules
  if (config.llmBots?.enabled !== false) {
    const agents = config.llmBots?.agents ?? DEFAULT_LLM_BOTS;

    // An empty list would leave `Allow: /llms.txt` without any User-agent
    // line, so the whole group is skipped in that case.
    if (agents.length > 0) {
      lines.push('# LLM-specific resources');
      lines.push('# See: https://github.com/anthropics/llm-txt');
      for (const agent of agents) {
        lines.push(`User-agent: ${agent}`);
      }
      lines.push('Allow: /llms.txt');
      closeGroup();
    }
  }

  // Additional agent rules: one group per entry
  for (const agent of config.additionalAgents ?? []) {
    lines.push(`User-agent: ${agent.userAgent}`);
    for (const path of agent.allow ?? []) {
      lines.push(`Allow: ${path}`);
    }
    for (const path of agent.disallow ?? []) {
      lines.push(`Disallow: ${path}`);
    }
    closeGroup();
  }

  // Custom rules are appended verbatim
  if (config.customRules) {
    lines.push('# Custom rules');
    lines.push(config.customRules);
    lines.push('');
  }

  return lines.join('\n');
}

4. LLMs.txt Generator (src/generators/llms.ts)

import type { LLMsConfig, ImportantPage } from '../types';

/**
 * Builds the contents of `llms.txt` as Markdown.
 *
 * Sections are written in a fixed order: title and description, site
 * information, key features, important pages, AI-assistant instructions,
 * API endpoints, technical stack, brand voice, custom sections, and a
 * "Last Updated" footer carrying today's date. Optional sections are left
 * out when they have nothing to show; in particular, empty tech-stack
 * categories are skipped and the "Technical Stack" heading is omitted when
 * no category has entries.
 *
 * @param config - llms.txt options; every field is optional.
 * @param siteURL - Site URL used for the title and to resolve page paths.
 * @returns The file contents, newline-separated.
 */
export async function generateLLMsTxt(
  config: LLMsConfig,
  siteURL: URL
): Promise<string> {
  const lines: string[] = [];

  // Header
  const description = typeof config.description === 'function'
    ? config.description()
    : config.description;

  lines.push(`# ${siteURL.hostname}`);
  if (description) {
    lines.push('', `> ${description}`);
  }
  lines.push('', '---', '');

  // Site Information
  lines.push('## Site Information', '');
  lines.push(`- **URL**: ${siteURL.href}`);
  if (description) {
    lines.push(`- **Description**: ${description}`);
  }
  lines.push('');

  // Key Features
  if (config.keyFeatures && config.keyFeatures.length > 0) {
    lines.push('## Key Features', '');
    for (const feature of config.keyFeatures) {
      lines.push(`- ${feature}`);
    }
    lines.push('');
  }

  // Important Pages (may be supplied lazily through an async function)
  if (config.importantPages) {
    const pages = typeof config.importantPages === 'function'
      ? await config.importantPages()
      : config.importantPages;

    if (pages.length > 0) {
      lines.push('## Important Pages', '');
      for (const page of pages) {
        const url = new URL(page.path, siteURL).href;
        lines.push(`- **${page.name}**: ${url}`);
        if (page.description) {
          lines.push(`  ${page.description}`);
        }
      }
      lines.push('');
    }
  }

  // Instructions for AI Assistants
  if (config.instructions) {
    lines.push('## For AI Assistants', '');
    lines.push(config.instructions);
    lines.push('');
  }

  // API Endpoints
  if (config.apiEndpoints && config.apiEndpoints.length > 0) {
    lines.push('## API Endpoints', '');
    for (const endpoint of config.apiEndpoints) {
      const method = endpoint.method || 'GET';
      lines.push(`- \`${method} ${endpoint.path}\` - ${endpoint.description}`);
    }
    lines.push('');
  }

  // Tech Stack: only categories with at least one entry are rendered, and
  // the heading is dropped when none qualify, so no empty bullets or empty
  // section appear.
  if (config.techStack) {
    const stack = config.techStack;
    const categories: Array<[string, string[] | undefined]> = [
      ['Frontend', stack.frontend],
      ['Backend', stack.backend],
      ['AI', stack.ai],
      ['Other', stack.other]
    ];
    const stackLines = categories.flatMap(([label, items]) =>
      items && items.length > 0 ? [`- **${label}**: ${items.join(', ')}`] : []
    );

    if (stackLines.length > 0) {
      lines.push('## Technical Stack', '', ...stackLines, '');
    }
  }

  // Brand Voice
  if (config.brandVoice && config.brandVoice.length > 0) {
    lines.push('## Brand Voice', '');
    for (const item of config.brandVoice) {
      lines.push(`- ${item}`);
    }
    lines.push('');
  }

  // Custom Sections: key is the heading, value is the body
  if (config.customSections) {
    for (const [title, content] of Object.entries(config.customSections)) {
      lines.push(`## ${title}`, '');
      lines.push(content);
      lines.push('');
    }
  }

  // Footer (date part of the ISO timestamp, i.e. YYYY-MM-DD)
  lines.push('---', '');
  lines.push(`Last Updated: ${new Date().toISOString().split('T')[0]}`);

  return lines.join('\n');
}

5. Humans.txt Generator (src/generators/humans.ts)

import type { HumansConfig } from '../types';

/**
 * Builds the contents of `humans.txt`.
 *
 * Sections are written in a fixed order (TEAM, THANKS, SITE, THE STORY,
 * FUN FACTS, PHILOSOPHY, then custom sections), each introduced by a
 * banner of the form `/* NAME *\/`. Sections with nothing to show are left
 * out. For SITE this means list fields with no entries are skipped, and the
 * banner is omitted when no field produced a line, so the output never
 * contains bare labels or an empty SITE section.
 *
 * @param config - humans.txt options; every field is optional.
 * @returns The file contents, newline-separated (empty string when nothing
 *   is configured).
 */
export function generateHumansTxt(config: HumansConfig): string {
  const lines: string[] = [];

  // Team section: members are separated by a blank line
  if (config.team && config.team.length > 0) {
    lines.push('/* TEAM */', '');

    for (const [index, member] of config.team.entries()) {
      if (index > 0) lines.push('');
      lines.push(`Name: ${member.name}`);
      if (member.role) lines.push(`Role: ${member.role}`);
      if (member.contact) lines.push(`Contact: ${member.contact}`);
      if (member.location) lines.push(`From: ${member.location}`);
      if (member.twitter) lines.push(`Twitter: ${member.twitter}`);
      if (member.github) lines.push(`GitHub: ${member.github}`);
    }

    lines.push('');
  }

  // Thanks section
  if (config.thanks && config.thanks.length > 0) {
    lines.push('/* THANKS */', '');
    for (const thanks of config.thanks) {
      lines.push(`- ${thanks}`);
    }
    lines.push('');
  }

  // Site section: collect the body first so the banner is only written
  // when there is something to put under it.
  if (config.site) {
    const site = config.site;
    const siteLines: string[] = [];

    // 'auto' stands for today's date (YYYY-MM-DD part of the ISO timestamp)
    const lastUpdate = site.lastUpdate === 'auto'
      ? new Date().toISOString().split('T')[0]
      : site.lastUpdate;

    if (lastUpdate) siteLines.push(`Last update: ${lastUpdate}`);
    if (site.language) siteLines.push(`Language: ${site.language}`);
    if (site.doctype) siteLines.push(`Doctype: ${site.doctype}`);
    if (site.ide) siteLines.push(`IDE: ${site.ide}`);

    const listFields: Array<[string, string[] | undefined]> = [
      ['Tech Stack', site.techStack],
      ['Standards', site.standards],
      ['Components', site.components],
      ['Software', site.software]
    ];
    for (const [label, items] of listFields) {
      if (items && items.length > 0) {
        siteLines.push(`${label}: ${items.join(', ')}`);
      }
    }

    if (siteLines.length > 0) {
      lines.push('/* SITE */', '', ...siteLines, '');
    }
  }

  // Story section
  if (config.story) {
    lines.push('/* THE STORY */', '');
    lines.push(config.story);
    lines.push('');
  }

  // Fun Facts section
  if (config.funFacts && config.funFacts.length > 0) {
    lines.push('/* FUN FACTS */', '');
    for (const fact of config.funFacts) {
      lines.push(`- ${fact}`);
    }
    lines.push('');
  }

  // Philosophy section: each item is quoted
  if (config.philosophy && config.philosophy.length > 0) {
    lines.push('/* PHILOSOPHY */', '');
    for (const item of config.philosophy) {
      lines.push(`"${item}"`);
    }
    lines.push('');
  }

  // Custom sections: the key is upper-cased into the banner
  if (config.customSections) {
    for (const [title, content] of Object.entries(config.customSections)) {
      lines.push(`/* ${title.toUpperCase()} */`, '');
      lines.push(content);
      lines.push('');
    }
  }

  return lines.join('\n');
}

6. API Route Template (src/routes/robots.ts)

import type { APIRoute } from 'astro';
import { generateRobotsTxt } from '../generators/robots';
import { getConfig } from '../config';

/**
 * Endpoint handler that serves `/robots.txt`.
 *
 * Renders the file with the user-supplied `templates.robots` function when
 * one is configured, otherwise with `generateRobotsTxt`, and returns it as
 * UTF-8 plain text with a public `Cache-Control` header.
 *
 * @returns A 200 response containing the robots.txt body.
 */
export const GET: APIRoute = ({ site }) => {
  const config = getConfig();
  // Fallback URL for when `site` is not available on the request context.
  const siteURL = site ?? new URL('http://localhost:4321');
  // Both renderers read fields off this object, so never hand them undefined.
  const robotsConfig = config.robots ?? {};

  const content = config.templates?.robots
    ? config.templates.robots(robotsConfig, siteURL)
    : generateRobotsTxt(robotsConfig, siteURL);

  // `??` instead of `||`: an explicit 0 ("do not cache") must be honoured
  // rather than silently replaced by the one-hour default.
  const maxAge = config.caching?.robots ?? 3600;

  return new Response(content, {
    status: 200,
    headers: {
      'Content-Type': 'text/plain; charset=utf-8',
      'Cache-Control': `public, max-age=${maxAge}`,
    },
  });
};

Testing Strategy

Unit Tests

// tests/robots.test.ts
import { describe, it, expect } from 'vitest';
import { generateRobotsTxt } from '../src/generators/robots';

describe('generateRobotsTxt', () => {
  // Renders robots.txt for the shared example origin with the given options.
  const render = (options: Parameters<typeof generateRobotsTxt>[0]) =>
    generateRobotsTxt(options, new URL('https://example.com'));

  it('generates basic robots.txt', () => {
    const output = render({});

    // The catch-all group and the absolute sitemap URL are always present.
    expect(output).toContain('User-agent: *');
    expect(output).toContain('Sitemap: https://example.com/sitemap-index.xml');
  });

  it('includes LLM bots when enabled', () => {
    const output = render({ llmBots: { enabled: true } });

    // Spot-check two agents from the built-in default list.
    for (const agent of ['Anthropic-AI', 'GPTBot']) {
      expect(output).toContain(agent);
    }
  });

  it('respects custom crawl delay', () => {
    const output = render({ crawlDelay: 5 });

    expect(output).toContain('Crawl-delay: 5');
  });
});

Integration Tests

// tests/integration.test.ts
import { describe, it, expect } from 'vitest';
import { testIntegration } from '@astrojs/test-utils';
import discovery from '../src/index';

describe('discovery integration', () => {
  it('generates all discovery files', async () => {
    // Build a fixture site with the integration at its default settings.
    const fixture = await testIntegration({
      integrations: [discovery()],
      site: 'https://example.com'
    });

    const emitted = await fixture.readdir('dist');

    // Each discovery file plus the sitemap index must be in the output dir.
    const expectedFiles = [
      'robots.txt',
      'llms.txt',
      'humans.txt',
      'sitemap-index.xml'
    ];
    for (const fileName of expectedFiles) {
      expect(emitted).toContain(fileName);
    }
  });
});

Build & Publish

package.json

{
  "name": "@astrojs/discovery",
  "version": "1.0.0",
  "description": "Complete discovery integration for Astro",
  "type": "module",
  "exports": {
    ".": "./dist/index.js",
    "./routes/*": "./dist/routes/*"
  },
  "files": [
    "dist",
    "README.md"
  ],
  "scripts": {
    "build": "tsc",
    "test": "vitest",
    "prepublishOnly": "npm run build && npm test"
  },
  "peerDependencies": {
    "astro": "^5.0.0"
  },
  "dependencies": {
    "@astrojs/sitemap": "^3.6.0"
  },
  "devDependencies": {
    "@astrojs/test-utils": "^1.0.0",
    "typescript": "^5.3.0",
    "vitest": "^1.0.0"
  },
  "keywords": [
    "astro",
    "astro-integration",
    "robots",
    "sitemap",
    "llms",
    "humans",
    "discovery",
    "seo"
  ]
}

Future Enhancements

  1. security.txt Support - Add RFC 9116 security.txt generation
  2. ads.txt Support - For sites with advertising
  3. manifest.json Support - PWA manifest generation
  4. RSS Feed Integration - Optional RSS feed generation
  5. OpenGraph Tags - Meta tag injection
  6. Structured Data - JSON-LD schema.org markup
  7. Analytics Integration - Built-in analytics discovery
  8. i18n Support - Multi-language discovery files

Resources


This integration is a proposal. Implementation details may vary based on Astro's API evolution.