diff --git a/tests/apitest/opstest/python/operations/multi_latent_attention/test_multi_latent_attention.py b/tests/apitest/opstest/python/operations/multi_latent_attention/test_multi_latent_attention.py
index 3007204666ec58df4f453442ba048129f2eb11ae..b400d476be933930cdeef753b159c0a1c98702fb 100644
--- a/tests/apitest/opstest/python/operations/multi_latent_attention/test_multi_latent_attention.py
+++ b/tests/apitest/opstest/python/operations/multi_latent_attention/test_multi_latent_attention.py
@@ -37,7 +37,7 @@ class TestPagedAttentionMLA(operation_test.OperationTest):
         strict_limit_error = torch.maximum(torch.abs(golden * ratios[2]), torch.tensor(ratios[3]))
         error_count = torch.gt(diff, limit_error).sum().item()
         strict_error_count = torch.gt(diff, strict_limit_error).sum().item()
-        print(f"maxDiff {max_diff}")
+        print(f"maxDiff {max_diff},{self.head_size_qk},{self.max_context_len}")
         print("1/1000 Accuracy is %f", 1 - float(error_count) / len)
         print("5/1000 Accuracy is %f", 1 - float(strict_error_count) / len)
         if self.data_type == torch.bfloat16 or self.is_int8_flag:
@@ -241,13 +241,12 @@ class TestPagedAttentionMLA(operation_test.OperationTest):
 
         self.head_size_vo = head_size_vo
         logging.debug(f'input info: {num_tokens}, {num_heads}, {kv_heads}, {head_size_qk}, {head_size_vo}, {block_size}, {num_blocks}, {k_seqlen}, {dtype}')
-
-        query = torch.from_numpy(np.random.uniform(-1.0, 1.0, size=(num_tokens, num_heads, head_size_qk))).to(dtype)
+        q_range = 5.0
+        query = torch.from_numpy(np.random.uniform(-q_range, q_range, size=(num_tokens, num_heads, head_size_qk))).to(dtype)
         # (num_blocks, block_size, num_heads, head_size)
-        kv_range = 1.0
+        kv_range = 5.0
         kv_type = dtype
         if is_int8_flag:
-            kv_range = 4.0
             kv_type = torch.int8
         if not compressHead:
             key_cache = torch.from_numpy(np.random.uniform(-kv_range, kv_range, size=(num_blocks, block_size, kv_heads, head_size_qk))).to(kv_type)
@@ -306,6 +305,7 @@ class TestPagedAttentionMLA(operation_test.OperationTest):
                 block_tables.append(block_table)
 
         self.is_int8_flag = is_int8_flag
+        head_size = head_size_qk
         if is_int8_flag:
             de_scale1_fp32 = np.random.randint(-1, 2, size=(kv_heads * head_size)).astype(np.float32)
             de_scale1_int64 = self.process_deq_scale(de_scale1_fp32)
diff --git a/tests/apitest/opstest/python/operations/multi_latent_attention/test_multi_latent_attention_mtp.py b/tests/apitest/opstest/python/operations/multi_latent_attention/test_multi_latent_attention_mtp.py
index ffe18d3fc588931c112cc5443503ef958dca78a2..bfcd32e91fbc46a74b0f168a8ec05c942723f1f3 100644
--- a/tests/apitest/opstest/python/operations/multi_latent_attention/test_multi_latent_attention_mtp.py
+++ b/tests/apitest/opstest/python/operations/multi_latent_attention/test_multi_latent_attention_mtp.py
@@ -37,9 +37,9 @@ class TestPagedAttentionMLA(operation_test.OperationTest):
         strict_limit_error = torch.maximum(torch.abs(golden * ratios[2]), torch.tensor(ratios[3]))
         error_count = torch.gt(diff, limit_error).sum().item()
         strict_error_count = torch.gt(diff, strict_limit_error).sum().item()
-        print(f"maxDiff {max_diff}")
-        print("1/1000 Accuracy is %f", 1 - float(error_count) / len)
-        print("5/1000 Accuracy is %f", 1 - float(strict_error_count) / len)
+        logging.info(f"maxDiff {max_diff},{self.head_size_qk},{self.max_context_len}")
+        logging.info("1/1000 Accuracy is %f", 1 - float(error_count) / len)
+        logging.info("5/1000 Accuracy is %f", 1 - float(strict_error_count) / len)
         if self.data_type == torch.bfloat16:
             print("accuracy is correct in old standard: %r", (float(strict_error_count) / len) <= ratios[2])
         else:
@@ -184,16 +184,20 @@ class TestPagedAttentionMLA(operation_test.OperationTest):
                   q_seqlen_list: int,
                   k_seqlen_list: int,
                   mask_type,
-                  dtype = torch.float16,
-                  calcType=0
+                  dtype = torch.bfloat16, mask_data_type = torch.bfloat16,
+                  calc_type = 0,
+                  dynamic_batch = False, dynamic_seqlen = None, is_int8_flag = False, has_bias = False,
+                  compressHead = False, is_kv_combined = True, is_nz_in = False, is_quant_flag = False
                   ):
         self.data_type = dtype
+        logging.info(f'------data_type:{self.data_type}-----------')
         self.head_size_qk = head_size_qk
-        self.calcType = calcType
-        q_min_range = -1.0
-        q_max_range = 1.0
-        kv_min_range = -1.0
-        kv_max_range = 1.0
+        self.calc_type = calc_type
+        self.is_quant_flag = is_quant_flag
+        q_min_range = -5.0
+        q_max_range = 5.0
+        kv_min_range = -5.0
+        kv_max_range = 5.0
         num_tokens = np.array(q_seqlen_list).sum()
         batch_size = len(q_seqlen_list)
         self.max_context_len = max(k_seqlen_list)
@@ -216,7 +220,11 @@ class TestPagedAttentionMLA(operation_test.OperationTest):
 
         self.pre_mask_factor = -10000.0
         self.post_mask_factor = 1.0
-
+        if self.data_type == torch.bfloat16:
+            self.pre_mask_factor = 1.0
+            self.post_mask_factor = -10000.0
+        logging.info(f'------pre_mask_factor:{self.pre_mask_factor}-------')
+        logging.info(f'------post_mask_factor:{self.post_mask_factor}-------')
         if mask_type == 1:
             mask = np.ones(shape=(max_k_seqlen, max_k_seqlen)).astype(np.float16)
             mask = np.triu(mask, 1)
@@ -238,30 +246,28 @@ class TestPagedAttentionMLA(operation_test.OperationTest):
         elif mask_type == 0:
             mask = None
 
-        logging.info(f'input info: {num_tokens}, {num_heads}, {kv_heads}, {head_size_qk}, {head_size_vo}, {block_size}, {num_blocks}')
+        logging.info(f'input info: num_tokens {num_tokens}, num_heads {num_heads}, kv_heads {kv_heads}')
+        logging.info(f'head_size_qk {head_size_qk}, head_size_vo {head_size_vo}, block_size {block_size}, num_blocks {num_blocks}')
 
         shape_out = (num_tokens, num_heads, head_size_vo)
-        ref_output = torch.zeros(shape_out, dtype=dtype)
+        ref_output = torch.zeros(shape_out, dtype = mask_data_type)
         true_out = torch.zeros(shape_out, dtype=torch.float32)
         lse = torch.zeros((num_tokens, num_heads, 1), dtype=dtype)
         true_lse = torch.zeros((num_tokens, num_heads, 1), dtype=torch.float32)
         self.ref_single_query_cached_kv_attention(
-            ref_output,
+            ref_output, # 6, 128, 512
             true_out,
             lse,
             true_lse,
-            query,
-            key_cache,
-            value_cache,
+            query, # 6, 128, 576
+            key_cache, # 1024, 128, 1, 576
+            value_cache, # [1024, 128, 1, 512])
             block_tables,
             q_seqlen_list,
             k_seqlen_list,
-            mask,
+            mask, # 6, 127
             mask_type
         )
-        self.q_split1, self.q_split2 = torch.split(query, [512, 64], dim=2)
-        self.key_cache_split1, self.key_cache_split2 = torch.split(key_cache, [512, 64], dim=3)
-
         self.q = query
         self.num_tokens = num_tokens
         self.key_cache = key_cache
@@ -272,6 +278,11 @@ class TestPagedAttentionMLA(operation_test.OperationTest):
         self.mask = mask
         self.golden_out = ref_output
         self.true_out = true_out
+        context_lens = [k_seqlen_list] * num_tokens
+        context_lens = [val for val in context_lens for _ in range(kv_heads)]
+        self.contex_lens = np.array(context_lens).astype(np.int32)
+        self.q_split1, self.q_split2 = torch.split(query, [512, 64], dim=2)
+        self.key_cache_split1, self.key_cache_split2 = torch.split(key_cache, [512, 64], dim=3)
         self.lse = lse
         self.true_lse = true_lse
 
@@ -296,7 +307,7 @@ class TestPagedAttentionMLA(operation_test.OperationTest):
     def golden_calc(self, in_tensors):
         golden_out = torch.tensor(self.golden_out)
         result = [golden_out]
-        if self.calcType == 3:
+        if self.calc_type == 3:
             result.append(self.lse)
         return result
 
@@ -434,7 +445,7 @@ class TestPagedAttentionMLA(operation_test.OperationTest):
         mask_type = 3
 
         self.calc_data(num_heads, kv_heads, num_blocks, block_size, head_size_qk, head_size_vo,
-                       q_seqlen, k_seqlen, mask_type, dtype, 3)
+                       q_seqlen, k_seqlen, mask_type, dtype, mask_data_type=dtype, calc_type=3)
 
         OP_NAME = "MultiLatentAttentionOperation"
         PARAM = json.dumps({"headNum": num_heads, "qkScale":tor, "kvHeadNum":kv_heads, "calcType": 3, "maskType": 1, "cacheMode": 1})
@@ -502,14 +513,14 @@ class TestPagedAttentionMLA(operation_test.OperationTest):
         num_blocks = 64
         k_seqlen = [127] * batch
         tor = 1.0 / (head_size_qk ** 0.5)
-        dtype = torch.float16
+        dtype = torch.bfloat16
         mask_type = 3
-
+        calc_type = 1
         self.calc_data(num_heads, kv_heads, num_blocks, block_size, head_size_qk, head_size_vo,
-                       q_seqlen, k_seqlen, mask_type, dtype)
+                       q_seqlen, k_seqlen, mask_type, dtype, calc_type=calc_type)
         mask_free = self.gen_mask(q_seqlen[0], dtype)
         OP_NAME = "MultiLatentAttentionOperation"
-        PARAM = json.dumps({"headNum": num_heads, "qkScale":tor, "kvHeadNum":kv_heads, "calcType": 1, "maskType": 2, "cacheMode": 1})
+        PARAM = json.dumps({"headNum": num_heads, "qkScale":tor, "kvHeadNum":kv_heads, "calcType": calc_type, "maskType": 2, "cacheMode": 1})
         RUN_PARAM = json.dumps({"contextLens": self.k_seqlen_list.tolist(), "qSeqlen": self.q_seqlen_list.tolist(), "maskType": 2})
         self.execute_with_param(OP_NAME, PARAM, RUN_PARAM,
                                 [self.q_split1.npu(),
diff --git a/tests/apitest/opstest/python/operations/multi_latent_attention/test_multi_latent_attention_prefill.py b/tests/apitest/opstest/python/operations/multi_latent_attention/test_multi_latent_attention_prefill.py
index d3faf0157e064dfb534707ae97ced1ec9af76d64..5bd9af4a477ead7e036260a27e6ed48f526449fc 100644
--- a/tests/apitest/opstest/python/operations/multi_latent_attention/test_multi_latent_attention_prefill.py
+++ b/tests/apitest/opstest/python/operations/multi_latent_attention/test_multi_latent_attention_prefill.py
@@ -123,20 +123,20 @@ class TestMLAPrefill(operation_test.OperationTest):
         self.layer_id = torch.from_numpy(np.array([0], dtype=np.int32)).to(torch.int32)
         self.q_max_seq = np.max(self.q_seqlen)
         self.kv_max_seq = np.max(self.kv_seqlen)
-        q = torch.from_numpy(np.random.uniform(-1.0, 1.0, size=(self.q_ntokens, heads * self.embeddim)))
+        q = torch.from_numpy(np.random.uniform(-5.0, 5.0, size=(self.q_ntokens, heads * self.embeddim)))
         self.q = q.to(data_type)
 
         if num_blocks is None:
-            self.k = torch.from_numpy(np.random.uniform(-1.0, 1.0, size=(self.layer_id[0] + 1, batch, self.max_seq, kv_head * self.embeddim))).to(data_type)
-            self.v = torch.from_numpy(np.random.uniform(-1.0, 1.0, size=(self.layer_id[0] + 1, batch, self.max_seq, kv_head * self.embeddimv))).to(data_type)
+            self.k = torch.from_numpy(np.random.uniform(-5.0, 5.0, size=(self.layer_id[0] + 1, batch, self.max_seq, kv_head * self.embeddim))).to(data_type)
+            self.v = torch.from_numpy(np.random.uniform(-5.0, 5.0, size=(self.layer_id[0] + 1, batch, self.max_seq, kv_head * self.embeddimv))).to(data_type)
             if is_splitm:
                 maxKvSeqlen = max(self.kv_seqlen)
-                self.k = torch.from_numpy(np.random.uniform(-1.0, 1.0, size=(self.layer_id[0] + 1, batch, maxKvSeqlen, kv_head * self.embeddim))).to(data_type)
-                self.v = torch.from_numpy(np.random.uniform(-1.0, 1.0, size=(self.layer_id[0] + 1, batch, maxKvSeqlen, kv_head * self.embeddimv))).to(data_type)
+                self.k = torch.from_numpy(np.random.uniform(-5.0, 5.0, size=(self.layer_id[0] + 1, batch, maxKvSeqlen, kv_head * self.embeddim))).to(data_type)
+                self.v = torch.from_numpy(np.random.uniform(-5.0, 5.0, size=(self.layer_id[0] + 1, batch, maxKvSeqlen, kv_head * self.embeddimv))).to(data_type)
         else:
             # kv cache shape: (num_blocks, block_size, num_heads, head_size)
-            self.k_cache = torch.from_numpy(np.random.uniform(-1.0, 1.0, size=(num_blocks, block_size, kv_head, embeddim))).to(data_type)
-            self.v_cache = torch.from_numpy(np.random.uniform(-1.0, 1.0, size=(num_blocks, block_size, kv_head, embeddim))).to(data_type)
+            self.k_cache = torch.from_numpy(np.random.uniform(-5.0, 5.0, size=(num_blocks, block_size, kv_head, embeddim))).to(data_type)
+            self.v_cache = torch.from_numpy(np.random.uniform(-5.0, 5.0, size=(num_blocks, block_size, kv_head, embeddim))).to(data_type)
 
         batch = len(kv_seqLen)
         max_context_len = max(kv_seqLen)
diff --git a/tests/apitest/opstest/python/operations/multi_latent_attention/test_multi_latent_attention_ring.py b/tests/apitest/opstest/python/operations/multi_latent_attention/test_multi_latent_attention_ring.py
index 7297b32b0c85b1429281c00b15192070250f4da6..ff016659f49dfe462e36899c04afa32312d14770 100644
--- a/tests/apitest/opstest/python/operations/multi_latent_attention/test_multi_latent_attention_ring.py
+++ b/tests/apitest/opstest/python/operations/multi_latent_attention/test_multi_latent_attention_ring.py
@@ -260,9 +260,10 @@ class TestPagedAttentionMLA(operation_test.OperationTest):
         logging.debug(
             f'input info: {num_tokens}, {num_heads}, {kv_heads}, {head_size_qk}, {head_size_vo}, {block_size}, {num_blocks}, {k_seqlen}, {dtype}')
 
-        query = torch.from_numpy(np.random.uniform(-1.0, 1.0, size=(num_tokens, num_heads, head_size_qk))).to(dtype)
+        q_range = 5.0
+        query = torch.from_numpy(np.random.uniform(-q_range, q_range, size=(num_tokens, num_heads, head_size_qk))).to(dtype)
         # (num_blocks, block_size, num_heads, head_size)
-        kv_range = 1.0
+        kv_range = 5.0
         kv_type = dtype
         if is_int8_flag:
             kv_range = 4.0