[Doc]Add instruction for profiling with bench_one_batch (#5581)

Fridge003 · web-flow · commit b54b5a96e4e9 · 2025-04-20T14:05:36.000-07:00
diff --git a/docs/references/benchmark_and_profiling.md b/docs/references/benchmark_and_profiling.md
@@ -41,9 +41,14 @@
  Please make sure that `SGLANG_TORCH_PROFILER_DIR` is set on both the server and the client side; otherwise, the trace file cannot be generated correctly. A safe way to do this is to set `SGLANG_TORCH_PROFILER_DIR` in your shell's `.*rc` file (e.g. `~/.bashrc` for bash shells).
 
 - To profile offline
-
   ```bash
   export SGLANG_TORCH_PROFILER_DIR=/root/sglang/profile_log
+
+  # profile one batch with bench_one_batch.py
+  # batch size can be controlled with --batch argument
+  python3 -m sglang.bench_one_batch --model-path meta-llama/Llama-3.1-8B-Instruct --batch 32 --input-len 1024 --output-len 10 --profile
+
+  # profile multiple batches with bench_offline_throughput.py
   python -m sglang.bench_offline_throughput --model-path meta-llama/Llama-3.1-8B-Instruct --dataset-name random --num-prompts 10 --profile --mem-frac=0.8
   ```
 
diff --git a/python/sglang/bench_one_batch.py b/python/sglang/bench_one_batch.py
@@ -396,7 +396,7 @@ def latency_test_run_once(
         decode_latencies.append(latency)
         if i < 5:
             rank_print(
-                f"Decode.  latency: {latency:6.5f} s, throughput: {throughput:9.2f} token/s"
+                f"Decode. Batch size: {batch_size}, latency: {latency:6.5f} s, throughput: {throughput:9.2f} token/s"
             )
 
     if profile:

Original file line number	Diff line number	Diff line change
`@@ -396,7 +396,7 @@ def latency_test_run_once(`
`396`	`396`	`decode_latencies.append(latency)`
`397`	`397`	`if i < 5:`
`398`	`398`	`rank_print(`
`399`		`- f"Decode. latency: {latency:6.5f} s, throughput: {throughput:9.2f} token/s"`
	`399`	`+ f"Decode. Batch size: {batch_size}, latency: {latency:6.5f} s, throughput: {throughput:9.2f} token/s"`
`400`	`400`	`)`
`401`	`401`
`402`	`402`	`if profile:`