diff --git a/ais_bench/benchmark/calculators/default_perf_metric_calculator.py b/ais_bench/benchmark/calculators/default_perf_metric_calculator.py
index a9a78642dfb52aefd0acc49356cda165069ac8af..5a9288228b27f65ed49e3c2cb500b2ad2eca12d7 100644
--- a/ais_bench/benchmark/calculators/default_perf_metric_calculator.py
+++ b/ais_bench/benchmark/calculators/default_perf_metric_calculator.py
@@ -63,11 +63,14 @@ class DefaultPerfMetricCalculator(BasePerfMetricCalculator):
         self.infer_time[stage_name] = max(result["end_time"]) - min(result["start_time"])
         per_request_avg_decode_time = []
         # Compute the average decode latency per request
-        for i, value in enumerate(result["seq_latency"]):
-            if value:  # Skip empty lists
-                tpot = (value - result["prefill_latency"][i]) / result["generate_tokens_len"][i]
-                per_request_avg_decode_time.append(tpot)
-        result["average_decode_latencies"] = per_request_avg_decode_time[:]
+        if not math.isclose(sum(result["prefill_latency"]), 0):
+            for i, value in enumerate(result["seq_latency"]):
+                if value:  # Skip empty lists
+                    tpot = (value - result["prefill_latency"][i]) / result["generate_tokens_len"][i]
+                    per_request_avg_decode_time.append(tpot)
+            result["average_decode_latencies"] = per_request_avg_decode_time[:]
+        else:
+            result["average_decode_latencies"] = result["prefill_latency"]
 
         self.logger.info("Converting perf results of stage ...")
         self.result[stage_name] = self.convert_result(copy.deepcopy(result))
diff --git a/ais_bench/benchmark/calculators/stable_perf_metric_calculator.py b/ais_bench/benchmark/calculators/stable_perf_metric_calculator.py
index 50ab6089ac0009b5fac4af5a1be5aac05e7d78df..ff91bb56a54a6596ccc21a8b6ae4c7cb13b5d196 100644
--- a/ais_bench/benchmark/calculators/stable_perf_metric_calculator.py
+++ b/ais_bench/benchmark/calculators/stable_perf_metric_calculator.py
@@ -107,10 +107,13 @@ class StablePerfMetricCalculator(BasePerfMetricCalculator):
         self.infer_time[stage_name] = self.stage_section[1] - self.stage_section[0]
         per_request_avg_decode_time = []
         # Compute the average decode latency per request
-        for i, value in enumerate(result["seq_latency"]):
-            if value:  # Skip empty lists
-                tpot = (value - result["prefill_latency"][i]) / result["generate_tokens_len"][i]
-                per_request_avg_decode_time.append(tpot)
-        result["average_decode_latencies"] = per_request_avg_decode_time[:]
+        if not math.isclose(sum(result["prefill_latency"]), 0):
+            for i, value in enumerate(result["seq_latency"]):
+                if value:  # Skip empty lists
+                    tpot = (value - result["prefill_latency"][i]) / result["generate_tokens_len"][i]
+                    per_request_avg_decode_time.append(tpot)
+            result["average_decode_latencies"] = per_request_avg_decode_time[:]
+        else:
+            result["average_decode_latencies"] = result["prefill_latency"]
         self.logger.info("Converting perf results of stage ...")
         self.result[stage_name] = self.convert_result(copy.deepcopy(result))
diff --git a/ais_bench/benchmark/clients/openai_chat_text_client.py b/ais_bench/benchmark/clients/openai_chat_text_client.py
index ccd1e17d32e0791992c9de8260251c28b2593c62..08fb01d6a4b45ca22923caff961e76d493a52824 100644
--- a/ais_bench/benchmark/clients/openai_chat_text_client.py
+++ b/ais_bench/benchmark/clients/openai_chat_text_client.py
@@ -25,4 +25,9 @@ class OpenAIChatTextClient(BaseClient, ABC):
         if generated_text:
             inputs.output = generated_text
             inputs.num_generated_chars = len(generated_text)
+            inputs.prefill_latency = res.get("prefill_time", 0)
+            inputs.chunk_time_point_list.append(inputs.chunk_time_point_list[0] + inputs.prefill_latency)
+
+            inputs.decode_cost = res.get("decode_time_arr", [])
+            inputs.chunk_time_point_list.append(inputs.chunk_time_point_list[1] + sum(inputs.decode_cost))
         return generated_text
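
Reviewer note: below is a minimal, self-contained sketch of the guarded per-request TPOT computation that both calculators now share, assuming `math` is already imported in those modules (no import hunk appears in this diff). The `result` payload is hypothetical; only the key names come from the diff. One caveat worth flagging: `math.isclose(x, 0)` with default tolerances matches only exact zero, since the relative tolerance scales with the larger operand.

import math

def average_decode_latencies(result: dict) -> list:
    # Mirrors the guarded logic in the diff: when every prefill latency is
    # zero (e.g. the backend reported no prefill_time), TPOT cannot be
    # derived, so the prefill latencies are returned unchanged as a fallback.
    if math.isclose(sum(result["prefill_latency"]), 0):
        return result["prefill_latency"]
    tpots = []
    for i, seq_latency in enumerate(result["seq_latency"]):
        if seq_latency:  # skip empty/zero entries, as in the diff
            tpot = (seq_latency - result["prefill_latency"][i]) / result["generate_tokens_len"][i]
            tpots.append(tpot)
    return tpots

# Hypothetical input: two requests generating 50 and 40 tokens.
result = {
    "seq_latency": [1.2, 0.9],
    "prefill_latency": [0.2, 0.1],
    "generate_tokens_len": [50, 40],
}
print(average_decode_latencies(result))  # ~[0.02, 0.02]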
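
Reviewer note: a sketch of the timeline the client additions reconstruct, assuming `chunk_time_point_list` starts with the request-start timestamp and that the `res` payload carries "prefill_time" and "decode_time_arr" as the diff suggests. The numbers and the `Inputs` stand-in class are hypothetical.

res = {"prefill_time": 0.15, "decode_time_arr": [0.02, 0.021, 0.019]}  # assumed payload shape

class Inputs:  # stand-in for the client's per-request state object
    def __init__(self, start_time: float):
        self.chunk_time_point_list = [start_time]  # [0]: request start
        self.prefill_latency = 0.0
        self.decode_cost = []

inputs = Inputs(start_time=100.0)
# [1]: first-token time = request start + prefill latency
inputs.prefill_latency = res.get("prefill_time", 0)
inputs.chunk_time_point_list.append(inputs.chunk_time_point_list[0] + inputs.prefill_latency)
# [2]: end of generation = first-token time + total decode time
inputs.decode_cost = res.get("decode_time_arr", [])
inputs.chunk_time_point_list.append(inputs.chunk_time_point_list[1] + sum(inputs.decode_cost))
print(inputs.chunk_time_point_list)  # ~[100.0, 100.15, 100.21]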