curl -H "Authorization: Bearer $CEREBRAS_API_KEY" \
https://cloud.cerebras.ai/api/v1/metrics/organizations/org_abc123
# HELP inference_endpoint_status Status of inference endpoint (-1=error calculating status, 0=down, 1=up)
# TYPE inference_endpoint_status gauge
inference_endpoint_status{endpoint="model",organization_id="org_abc123"} 1.0
# HELP requests_count_total Total request count (all HTTP codes) in the last complete minute
# TYPE requests_count_total gauge
requests_count_total{endpoint="model",organization_id="org_abc123"} 1.0
# HELP requests_success_total Total successful requests (HTTP 200) in the last complete minute
# TYPE requests_success_total gauge
requests_success_total{endpoint="model",organization_id="org_abc123"} 1.0
# HELP requests_failure_total Total failed requests by HTTP code in the last complete minute
# TYPE requests_failure_total gauge
requests_failure_total{endpoint="model",organization_id="org_abc123"} 0.0
# HELP input_tokens_total Total input tokens for successful requests in the last complete minute
# TYPE input_tokens_total gauge
input_tokens_total{endpoint="model",organization_id="org_abc123"} 123456.0
# HELP output_tokens_total Total output tokens for successful requests in the last complete minute
# TYPE output_tokens_total gauge
output_tokens_total{endpoint="model",organization_id="org_abc123"} 12345.0
# HELP queue_time_seconds Queue time percentiles in seconds
# TYPE queue_time_seconds gauge
queue_time_seconds{endpoint="model",organization_id="org_abc123",percentile="avg"} 0.55
queue_time_seconds{endpoint="model",organization_id="org_abc123",percentile="p50"} 0.55
queue_time_seconds{endpoint="model",organization_id="org_abc123",percentile="p90"} 0.55
queue_time_seconds{endpoint="model",organization_id="org_abc123",percentile="p95"} 0.55
queue_time_seconds{endpoint="model",organization_id="org_abc123",percentile="p99"} 0.55
# HELP e2e_latency_seconds End-to-end API latency percentiles in seconds
# TYPE e2e_latency_seconds gauge
e2e_latency_seconds{endpoint="model",organization_id="org_abc123",statistic="avg"} 0.9
e2e_latency_seconds{endpoint="model",organization_id="org_abc123",statistic="p50"} 0.9
e2e_latency_seconds{endpoint="model",organization_id="org_abc123",statistic="p90"} 0.9
e2e_latency_seconds{endpoint="model",organization_id="org_abc123",statistic="p95"} 0.9
e2e_latency_seconds{endpoint="model",organization_id="org_abc123",statistic="p99"} 0.9
# HELP ttft_seconds Time To First Token percentiles in seconds
# TYPE ttft_seconds gauge
ttft_seconds{endpoint="model",organization_id="org_abc123",statistic="avg"} 0.9
ttft_seconds{endpoint="model",organization_id="org_abc123",statistic="p50"} 0.9
ttft_seconds{endpoint="model",organization_id="org_abc123",statistic="p90"} 0.9
ttft_seconds{endpoint="model",organization_id="org_abc123",statistic="p95"} 0.9
ttft_seconds{endpoint="model",organization_id="org_abc123",statistic="p99"} 0.9
# HELP cache_reads_total Total input tokens read from cache for successful requests in the last complete minute
# TYPE cache_reads_total gauge
cache_reads_total{endpoint="model",organization_id="org_abc123"} 1234.0
# HELP cache_rate Ratio of input tokens read from cache, to total input tokens, for successful requests in the last complete minute
# TYPE cache_rate gauge
cache_rate{endpoint="model",organization_id="org_abc123"} 0.01
# HELP tpot Time Per Output Token (TPOT) percentiles
# TYPE tpot gauge
tpot{endpoint="model",organization_id="org_abc123",statistic="avg"} 0.0001
tpot{endpoint="model",organization_id="org_abc123",statistic="p50"} 0.0001
tpot{endpoint="model",organization_id="org_abc123",statistic="p90"} 0.0001
tpot{endpoint="model",organization_id="org_abc123",statistic="p95"} 0.0001
tpot{endpoint="model",organization_id="org_abc123",statistic="p99"} 0.0001
# HELP latency_generation_seconds Completion time percentiles in seconds
# TYPE latency_generation_seconds gauge
latency_generation_seconds{endpoint="model",organization_id="org_abc123",statistic="avg"} 1.1
latency_generation_seconds{endpoint="model",organization_id="org_abc123",statistic="p50"} 1.1
latency_generation_seconds{endpoint="model",organization_id="org_abc123",statistic="p90"} 1.1
latency_generation_seconds{endpoint="model",organization_id="org_abc123",statistic="p95"} 1.1
latency_generation_seconds{endpoint="model",organization_id="org_abc123",statistic="p99"} 1.1
Retrieve operational metrics for your organization’s inference endpoints in Prometheus format.
curl -H "Authorization: Bearer $CEREBRAS_API_KEY" \
https://cloud.cerebras.ai/api/v1/metrics/organizations/org_abc123
# HELP inference_endpoint_status Status of inference endpoint (-1=error calculating status, 0=down, 1=up)
# TYPE inference_endpoint_status gauge
inference_endpoint_status{endpoint="model",organization_id="org_abc123"} 1.0
# HELP requests_count_total Total request count (all HTTP codes) in the last complete minute
# TYPE requests_count_total gauge
requests_count_total{endpoint="model",organization_id="org_abc123"} 1.0
# HELP requests_success_total Total successful requests (HTTP 200) in the last complete minute
# TYPE requests_success_total gauge
requests_success_total{endpoint="model",organization_id="org_abc123"} 1.0
# HELP requests_failure_total Total failed requests by HTTP code in the last complete minute
# TYPE requests_failure_total gauge
requests_failure_total{endpoint="model",organization_id="org_abc123"} 0.0
# HELP input_tokens_total Total input tokens for successful requests in the last complete minute
# TYPE input_tokens_total gauge
input_tokens_total{endpoint="model",organization_id="org_abc123"} 123456.0
# HELP output_tokens_total Total output tokens for successful requests in the last complete minute
# TYPE output_tokens_total gauge
output_tokens_total{endpoint="model",organization_id="org_abc123"} 12345.0
# HELP queue_time_seconds Queue time percentiles in seconds
# TYPE queue_time_seconds gauge
queue_time_seconds{endpoint="model",organization_id="org_abc123",percentile="avg"} 0.55
queue_time_seconds{endpoint="model",organization_id="org_abc123",percentile="p50"} 0.55
queue_time_seconds{endpoint="model",organization_id="org_abc123",percentile="p90"} 0.55
queue_time_seconds{endpoint="model",organization_id="org_abc123",percentile="p95"} 0.55
queue_time_seconds{endpoint="model",organization_id="org_abc123",percentile="p99"} 0.55
# HELP e2e_latency_seconds End-to-end API latency percentiles in seconds
# TYPE e2e_latency_seconds gauge
e2e_latency_seconds{endpoint="model",organization_id="org_abc123",statistic="avg"} 0.9
e2e_latency_seconds{endpoint="model",organization_id="org_abc123",statistic="p50"} 0.9
e2e_latency_seconds{endpoint="model",organization_id="org_abc123",statistic="p90"} 0.9
e2e_latency_seconds{endpoint="model",organization_id="org_abc123",statistic="p95"} 0.9
e2e_latency_seconds{endpoint="model",organization_id="org_abc123",statistic="p99"} 0.9
# HELP ttft_seconds Time To First Token percentiles in seconds
# TYPE ttft_seconds gauge
ttft_seconds{endpoint="model",organization_id="org_abc123",statistic="avg"} 0.9
ttft_seconds{endpoint="model",organization_id="org_abc123",statistic="p50"} 0.9
ttft_seconds{endpoint="model",organization_id="org_abc123",statistic="p90"} 0.9
ttft_seconds{endpoint="model",organization_id="org_abc123",statistic="p95"} 0.9
ttft_seconds{endpoint="model",organization_id="org_abc123",statistic="p99"} 0.9
# HELP cache_reads_total Total input tokens read from cache for successful requests in the last complete minute
# TYPE cache_reads_total gauge
cache_reads_total{endpoint="model",organization_id="org_abc123"} 1234.0
# HELP cache_rate Ratio of input tokens read from cache, to total input tokens, for successful requests in the last complete minute
# TYPE cache_rate gauge
cache_rate{endpoint="model",organization_id="org_abc123"} 0.01
# HELP tpot Time Per Output Token (TPOT) percentiles
# TYPE tpot gauge
tpot{endpoint="model",organization_id="org_abc123",statistic="avg"} 0.0001
tpot{endpoint="model",organization_id="org_abc123",statistic="p50"} 0.0001
tpot{endpoint="model",organization_id="org_abc123",statistic="p90"} 0.0001
tpot{endpoint="model",organization_id="org_abc123",statistic="p95"} 0.0001
tpot{endpoint="model",organization_id="org_abc123",statistic="p99"} 0.0001
# HELP latency_generation_seconds Completion time percentiles in seconds
# TYPE latency_generation_seconds gauge
latency_generation_seconds{endpoint="model",organization_id="org_abc123",statistic="avg"} 1.1
latency_generation_seconds{endpoint="model",organization_id="org_abc123",statistic="p50"} 1.1
latency_generation_seconds{endpoint="model",organization_id="org_abc123",statistic="p90"} 1.1
latency_generation_seconds{endpoint="model",organization_id="org_abc123",statistic="p95"} 1.1
latency_generation_seconds{endpoint="model",organization_id="org_abc123",statistic="p99"} 1.1
curl -H "Authorization: Bearer $CEREBRAS_API_KEY" \
https://cloud.cerebras.ai/api/v1/metrics/organizations/org_abc123
# HELP inference_endpoint_status Status of inference endpoint (-1=error calculating status, 0=down, 1=up)
# TYPE inference_endpoint_status gauge
inference_endpoint_status{endpoint="model",organization_id="org_abc123"} 1.0
# HELP requests_count_total Total request count (all HTTP codes) in the last complete minute
# TYPE requests_count_total gauge
requests_count_total{endpoint="model",organization_id="org_abc123"} 1.0
# HELP requests_success_total Total successful requests (HTTP 200) in the last complete minute
# TYPE requests_success_total gauge
requests_success_total{endpoint="model",organization_id="org_abc123"} 1.0
# HELP requests_failure_total Total failed requests by HTTP code in the last complete minute
# TYPE requests_failure_total gauge
requests_failure_total{endpoint="model",organization_id="org_abc123"} 0.0
# HELP input_tokens_total Total input tokens for successful requests in the last complete minute
# TYPE input_tokens_total gauge
input_tokens_total{endpoint="model",organization_id="org_abc123"} 123456.0
# HELP output_tokens_total Total output tokens for successful requests in the last complete minute
# TYPE output_tokens_total gauge
output_tokens_total{endpoint="model",organization_id="org_abc123"} 12345.0
# HELP queue_time_seconds Queue time percentiles in seconds
# TYPE queue_time_seconds gauge
queue_time_seconds{endpoint="model",organization_id="org_abc123",percentile="avg"} 0.55
queue_time_seconds{endpoint="model",organization_id="org_abc123",percentile="p50"} 0.55
queue_time_seconds{endpoint="model",organization_id="org_abc123",percentile="p90"} 0.55
queue_time_seconds{endpoint="model",organization_id="org_abc123",percentile="p95"} 0.55
queue_time_seconds{endpoint="model",organization_id="org_abc123",percentile="p99"} 0.55
# HELP e2e_latency_seconds End-to-end API latency percentiles in seconds
# TYPE e2e_latency_seconds gauge
e2e_latency_seconds{endpoint="model",organization_id="org_abc123",statistic="avg"} 0.9
e2e_latency_seconds{endpoint="model",organization_id="org_abc123",statistic="p50"} 0.9
e2e_latency_seconds{endpoint="model",organization_id="org_abc123",statistic="p90"} 0.9
e2e_latency_seconds{endpoint="model",organization_id="org_abc123",statistic="p95"} 0.9
e2e_latency_seconds{endpoint="model",organization_id="org_abc123",statistic="p99"} 0.9
# HELP ttft_seconds Time To First Token percentiles in seconds
# TYPE ttft_seconds gauge
ttft_seconds{endpoint="model",organization_id="org_abc123",statistic="avg"} 0.9
ttft_seconds{endpoint="model",organization_id="org_abc123",statistic="p50"} 0.9
ttft_seconds{endpoint="model",organization_id="org_abc123",statistic="p90"} 0.9
ttft_seconds{endpoint="model",organization_id="org_abc123",statistic="p95"} 0.9
ttft_seconds{endpoint="model",organization_id="org_abc123",statistic="p99"} 0.9
# HELP cache_reads_total Total input tokens read from cache for successful requests in the last complete minute
# TYPE cache_reads_total gauge
cache_reads_total{endpoint="model",organization_id="org_abc123"} 1234.0
# HELP cache_rate Ratio of input tokens read from cache, to total input tokens, for successful requests in the last complete minute
# TYPE cache_rate gauge
cache_rate{endpoint="model",organization_id="org_abc123"} 0.01
# HELP tpot Time Per Output Token (TPOT) percentiles
# TYPE tpot gauge
tpot{endpoint="model",organization_id="org_abc123",statistic="avg"} 0.0001
tpot{endpoint="model",organization_id="org_abc123",statistic="p50"} 0.0001
tpot{endpoint="model",organization_id="org_abc123",statistic="p90"} 0.0001
tpot{endpoint="model",organization_id="org_abc123",statistic="p95"} 0.0001
tpot{endpoint="model",organization_id="org_abc123",statistic="p99"} 0.0001
# HELP latency_generation_seconds Completion time percentiles in seconds
# TYPE latency_generation_seconds gauge
latency_generation_seconds{endpoint="model",organization_id="org_abc123",statistic="avg"} 1.1
latency_generation_seconds{endpoint="model",organization_id="org_abc123",statistic="p50"} 1.1
latency_generation_seconds{endpoint="model",organization_id="org_abc123",statistic="p90"} 1.1
latency_generation_seconds{endpoint="model",organization_id="org_abc123",statistic="p95"} 1.1
latency_generation_seconds{endpoint="model",organization_id="org_abc123",statistic="p99"} 1.1
Status values for inference_endpoint_status:
  -1 = Error calculating status
   0 = Down
   1 = Up
See also: latency_generation_seconds.