Files
oh-my-claudecode/benchmark/run_vanilla.sh
Yeachan-Heo df0f27071f Keep agent guidance and benchmark defaults aligned with OMC
The root AGENTS contract had drifted toward OMX/Codex wording and state
paths, which made the project-level guidance inconsistent with the actual
OMC runtime. The benchmark suite also carried split default model strings
across shell wrappers, the Python runner, and results docs, so this cleanup
re-aligned the suite on one current Sonnet 4.6 default and added a narrow
contract test to catch future regressions.

Constraint: Limit the cleanup to stale OMC-vs-OMX references and benchmark model strings
Rejected: Regenerate broader docs/templates wholesale | unnecessary scope for a targeted cleanup issue
Confidence: high
Scope-risk: narrow
Reversibility: clean
Directive: Keep root AGENTS branding/state paths in sync with OMC runtime contracts and update benchmark defaults in one place when the benchmark model changes
Tested: ./node_modules/.bin/vitest run src/__tests__/tier0-docs-consistency.test.ts src/__tests__/hooks.test.ts src/config/__tests__/loader.test.ts
Tested: python3 -m py_compile benchmark/run_benchmark.py
Tested: bash -n benchmark/quick_test.sh benchmark/run_vanilla.sh benchmark/run_omc.sh benchmark/run_full_comparison.sh
Not-tested: Full benchmark execution against live Anthropic/SWE-bench infrastructure
2026-03-21 01:10:33 +00:00

139 lines
3.5 KiB
Bash
Executable File

#!/bin/bash
set -euo pipefail
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
log_info() {
echo -e "${GREEN}[INFO]${NC} $1"
}
log_warn() {
echo -e "${YELLOW}[WARN]${NC} $1"
}
log_error() {
echo -e "${RED}[ERROR]${NC} $1"
}
log_step() {
echo -e "${BLUE}[STEP]${NC} $1"
}
# Script directory
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
# Configuration
RUN_MODE="vanilla"
PREDICTIONS_DIR="$SCRIPT_DIR/predictions/$RUN_MODE"
LOGS_DIR="$SCRIPT_DIR/logs"
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
LOG_FILE="$LOGS_DIR/${RUN_MODE}_${TIMESTAMP}.log"
# Parse arguments
LIMIT=""
SKIP=""
MODEL="claude-sonnet-4-6-20260217"
TIMEOUT="300"
while [[ $# -gt 0 ]]; do
case $1 in
--limit)
LIMIT="$2"
shift 2
;;
--skip)
SKIP="$2"
shift 2
;;
--model)
MODEL="$2"
shift 2
;;
--timeout)
TIMEOUT="$2"
shift 2
;;
-h|--help)
echo "Usage: $0 [OPTIONS]"
echo ""
echo "Options:"
echo " --limit N Limit to N instances (default: all)"
echo " --skip N Skip first N instances (default: 0)"
echo " --model MODEL Claude model to use (default: claude-sonnet-4-6-20260217)"
echo " --timeout SECS Timeout per instance (default: 300)"
echo " -h, --help Show this help message"
exit 0
;;
*)
log_error "Unknown option: $1"
exit 1
;;
esac
done
# Verify API key (check both possible env var names)
if [ -z "${ANTHROPIC_AUTH_TOKEN:-}" ] && [ -z "${ANTHROPIC_API_KEY:-}" ]; then
log_error "ANTHROPIC_AUTH_TOKEN is not set. Please export it."
exit 1
fi
log_info "=========================================="
log_info "Running VANILLA Claude Code Benchmark"
log_info "=========================================="
log_info "Mode: $RUN_MODE"
log_info "Model: $MODEL"
log_info "Timeout: ${TIMEOUT}s per instance"
[ -n "$LIMIT" ] && log_info "Limit: $LIMIT instances"
[ -n "$SKIP" ] && log_info "Skip: $SKIP instances"
log_info "Output: $PREDICTIONS_DIR"
log_info "Log: $LOG_FILE"
log_info ""
# Create directories
mkdir -p "$PREDICTIONS_DIR"
mkdir -p "$LOGS_DIR"
# Build command
CMD="python3 $SCRIPT_DIR/run_benchmark.py"
CMD="$CMD --mode $RUN_MODE"
CMD="$CMD --model $MODEL"
CMD="$CMD --timeout $TIMEOUT"
CMD="$CMD --output-dir $PREDICTIONS_DIR"
[ -n "$LIMIT" ] && CMD="$CMD --limit $LIMIT"
[ -n "$SKIP" ] && CMD="$CMD --skip $SKIP"
log_step "Starting benchmark run..."
log_info "Command: $CMD"
log_info ""
# Run benchmark with tee for live output and logging
$CMD 2>&1 | tee "$LOG_FILE"
EXIT_CODE=${PIPESTATUS[0]}
echo ""
if [ $EXIT_CODE -eq 0 ]; then
log_info "=========================================="
log_info "Benchmark completed successfully!"
log_info "=========================================="
log_info "Results: $PREDICTIONS_DIR"
log_info "Log: $LOG_FILE"
log_info ""
log_info "Next steps:"
log_info " 1. Run evaluation: python3 evaluate.py --predictions $PREDICTIONS_DIR"
log_info " 2. Compare with OMC: ./run_omc.sh"
log_info ""
else
log_error "=========================================="
log_error "Benchmark failed with exit code: $EXIT_CODE"
log_error "=========================================="
log_error "Check log file: $LOG_FILE"
exit $EXIT_CODE
fi