Skip to content

Programmatic API

agent-eval-kit can be used programmatically in addition to the CLI. All core functions are exported from the main package.

import { loadConfig, runSuite, saveRun } from "agent-eval-kit";
const config = await loadConfig({ cwd: process.cwd() });
for (const suite of config.suites) {
const run = await runSuite(suite, {
mode: "replay",
timeoutMs: config.run.timeoutMs,
judge: config.judge?.call,
plugins: config.plugins,
});
await saveRun(run);
console.log(`${suite.name}: ${run.summary.passRate * 100}% pass rate`);
}
import { loadConfig, defineConfig } from "agent-eval-kit";
// Load from eval.config.ts (auto-detected)
const config = await loadConfig();
// Load from a specific directory
const config = await loadConfig({ cwd: "/path/to/project" });
// Load from a specific config path
const config = await loadConfig({ configPath: "custom.config.ts" });

loadConfig returns a ValidatedConfig with cases resolved, defaults applied, and plugins validated.

import { saveRun, loadRun, listRuns } from "agent-eval-kit";
// Save a run result
const path = await saveRun(run); // saves to .eval-runs/<id>.json
const path = await saveRun(run, "my-dir"); // custom directory
// Load a specific run
const run = await loadRun("run-id");
// List recent runs
const runs = await listRuns(); // RunMeta[] sorted newest-first
// Each: { id, suiteId, mode, timestamp, passRate }
import { compareRuns, formatComparisonReport } from "agent-eval-kit/comparison";
import { loadRun } from "agent-eval-kit";
const base = await loadRun("base-run-id");
const compare = await loadRun("compare-run-id");
const comparison = compareRuns(base, compare, { scoreThreshold: 0.05 });
const report = formatComparisonReport(comparison, { color: false, verbose: true });
console.log(report);
import {
formatConsoleReport,
formatJsonReport,
formatJunitXml,
formatMarkdownReport,
formatMarkdownSummary,
} from "agent-eval-kit";
// Console (human-readable)
const text = formatConsoleReport(run, { color: true, verbose: false });
// JSON (full Run object)
const json = formatJsonReport(run);
// JUnit XML
const xml = formatJunitXml(run);
// Markdown tables
const md = formatMarkdownReport(run);
// Markdown summary (compact, suitable for PR comments)
const summary = formatMarkdownSummary(run);
import {
createCachingJudge,
createDiskCachingJudge,
clearJudgeCache,
judgeCacheStats,
} from "agent-eval-kit";
// In-memory cache (process lifetime)
const cached = createCachingJudge(myJudgeFn, { maxEntries: 1000 });
// Disk cache (persists across runs)
const diskCached = createDiskCachingJudge(myJudgeFn, {
cacheDir: ".eval-cache/judge",
ttlDays: 7,
maxEntries: 10_000,
});
// Cache management
await clearJudgeCache();
const stats = await judgeCacheStats();
// stats: { entries: number, totalBytes: number }
import { generateRunId } from "agent-eval-kit";
// Generate a timestamped run ID: run-YYYYMMDD-HHmmss-XXXX
const id = generateRunId();
// e.g. "run-20260302-143022-a7f3"
// Optionally pass a Date for deterministic IDs in tests
const id = generateRunId(new Date("2026-01-15T10:30:00Z"));
import { createTokenBucketLimiter } from "agent-eval-kit";
const limiter = createTokenBucketLimiter({ maxRequestsPerMinute: 60 });
// Acquire a token before each request (waits if rate-limited)
await limiter.acquire(signal);
// Clean up when done
limiter.dispose();
import { estimateCost } from "agent-eval-kit";
const estimate = estimateCost(suite, { mode: "live", trials: 3 });
// { judgeCalls: number, targetCalls: number, summary: string }

estimateCost does not estimate dollar cost — it returns call counts so you can calculate spend from your provider’s pricing.

import { computeAllTrialStats, computeTrialStats, wilsonInterval } from "agent-eval-kit";
// Compute per-case trial statistics (returns undefined if trialCount <= 1)
const stats = computeAllTrialStats(trials, trialCount);
// Compute stats for a single case across trials
const caseStats = computeTrialStats(trials, "case-id");
// caseStats: { trialCount, passCount, failCount, errorCount, passRate, meanScore, scoreStdDev, ci95Low, ci95High, flaky }
// Compute a Wilson score interval
const interval = wilsonInterval(successes, total, 1.96);
// interval.low, interval.high
import { createFileWatcher } from "agent-eval-kit";
const watcher = createFileWatcher({
paths: ["/path/to/project"],
debounceMs: 300,
});
watcher.on("change", async (files) => {
console.log("Changed:", files);
// Re-run evals
});
// Later: clean up
await watcher.close();
import { createProgressPlugin } from "agent-eval-kit";
const progress = createProgressPlugin({ noColor: false });
// Use as a plugin: plugins: [progress]
// Streams per-trial results to stderr on TTY:
// ✓ case-id 420ms
// ✗ case-id 1.2s
// 3/10 (30%) ← counter line, overwritten each trial

All 20 built-in graders and composition operators (17 graders plus all, any, and not), along with their option types, are available from the root package:

import {
// Text
contains, notContains, exactMatch, regex, jsonSchema,
// Tool calls
toolCalled, toolNotCalled, toolSequence, toolArgsMatch,
// Metrics
latency, cost, tokenCount,
// Safety
safetyKeywords, noHallucinatedNumbers,
// LLM
llmRubric, factuality, llmClassify,
// Composition
all, any, not,
} from "agent-eval-kit";

The agent-eval-kit/graders subpath re-exports the same graders plus scoring internals (computeCaseResult). See the Graders API for full documentation.

All types are available from the main package or its subpath exports:

// Core types
import type {
EvalConfig,
SuiteConfig,
Case,
CaseInput,
CaseExpected,
TargetOutput,
Run,
Trial,
RunSummary,
RunOptions,
GateConfig,
} from "agent-eval-kit";
// Grader types
import type {
GraderFn,
GraderConfig,
GraderContext,
GraderFactory,
GradeResult,
CaseResult,
} from "agent-eval-kit"; // also available from "agent-eval-kit/graders"
// Judge types
import type {
JudgeCallFn,
JudgeCallOptions,
JudgeConfig,
JudgeMessage,
JudgeResponse,
} from "agent-eval-kit";
// Plugin types
import type {
EvalPlugin,
PluginHooks,
BeforeRunContext,
AfterTrialContext,
} from "agent-eval-kit/plugin";