# oh-my-claudecode/benchmark/.env.example

# SWE-bench Evaluation Configuration
# Copy this file to .env and fill in your values

# Required: Anthropic authentication token for Claude Code
ANTHROPIC_AUTH_TOKEN=your_token_here

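# Optional: Custom Anthropic API base URL (e.g. a proxy or gateway).
# NOTE: requests, including ANTHROPIC_AUTH_TOKEN, are sent to this host.
# Remove or comment out this line to use the default Anthropic endpoint.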
ANTHROPIC_BASE_URL=https://api.layofflabs.com

# Run mode: 'vanilla' for standard Claude Code, 'omc' for Claude Code enhanced with oh-my-claudecode
RUN_MODE=vanilla

# Maximum parallel workers for evaluation
MAX_WORKERS=4

# Dataset to evaluate against
# Options:
#   - princeton-nlp/SWE-bench_Verified (500 human-validated instances, recommended)
#   - princeton-nlp/SWE-bench_Lite (300 instances, easier subset)
#   - princeton-nlp/SWE-bench (full 2294 instances)
DATASET=princeton-nlp/SWE-bench_Verified

# Optional: Subset of instances to run (comma-separated instance IDs)
# INSTANCE_IDS=django__django-11099,sympy__sympy-18057

# Optional: Maximum instances to evaluate (useful for testing)
# MAX_INSTANCES=10

# Optional: Timeout per instance in seconds (default: 1800 = 30 minutes)
# INSTANCE_TIMEOUT=1800

# Optional: Model to use (default: claude-sonnet-4-20250514)
# MODEL=claude-sonnet-4-20250514

# Optional: Enable verbose logging
# VERBOSE=true