Cost formulas
Per-model monthly cost
monthly_api_cost =
(queries × avg_input_tokens / 1M) × price_per_M_input
+ (queries × avg_output_tokens / 1M) × price_per_M_output
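# Caching discount — up to 90% off, scaled by cache_hit_rate
# NOTE: described as a discount on input tokens, but as written the multiplier below applies to the whole API cost (input + output)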
monthly_api_cost *= (1 - cache_hit_rate × 0.9)
# Batch discount — up to 50% off when "Batch-tolerant" is selected
monthly_api_cost *= (1 - batch_rate × 0.5)
# Self-hosted (open-weight SLMs): monthly GPU cost plus one-time setup labor
# (8 hours per effort day) amortized over 12 months
self_hosted_cost =
monthly_gpu_cost_usd
+ (setup_effort_days × 8h × eng_hourly_rate_usd) / 12
# Effective cost by deployment mode
effective_monthly_cost = {
api: monthly_api_cost,
managed-inference: monthly_api_cost × 1.10,
self-hosted-gpu: self_hosted_cost,
on-prem: self_hosted_cost × 1.25,
air-gapped: self_hosted_cost × 1.40,
}[chosen_deployment_mode]

Defaults: cache_hit_rate = 0; batch_rate = 1 when “Batch-tolerant” is selected, else 0; eng_hourly_rate_usd = 150.