diff --git a/tests/kernels/quantization/golden/hybrid_w4a16_gfx1151.json b/tests/kernels/quantization/golden/hybrid_w4a16_gfx1151.json index 9a0bef551c28..94264070f62f 100644 --- a/tests/kernels/quantization/golden/hybrid_w4a16_gfx1151.json +++ b/tests/kernels/quantization/golden/hybrid_w4a16_gfx1151.json @@ -13,67 +13,67 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 1.4128 + "tflops": 0.6721 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.808 + "tflops": 1.1036 }, { "batch_size": 4, "kernel": "wvsplitk_int4", - "tflops": 1.7654 + "tflops": 1.7503 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 3.4264 + "tflops": 2.1418 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 6.7844 + "tflops": 4.2501 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 12.8477 + "tflops": 7.5174 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 12.6978 + "tflops": 9.8029 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 18.4082 + "tflops": 10.7396 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 15.6968 + "tflops": 17.9939 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 16.3975 + "tflops": 22.7349 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 16.6954 + "tflops": 23.1121 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 21.4047 + "tflops": 27.3183 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 21.6875 + "tflops": 28.7668 } ] }, @@ -83,67 +83,67 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 1.2419 + "tflops": 0.6399 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.7243 + "tflops": 1.0612 }, { "batch_size": 4, "kernel": "wvsplitk_int4", - "tflops": 1.7317 + "tflops": 1.7132 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 3.4744 + "tflops": 2.1241 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 6.8001 
+ "tflops": 4.3826 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 12.9224 + "tflops": 7.4463 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 12.1375 + "tflops": 11.1026 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 19.0644 + "tflops": 10.8838 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 15.5782 + "tflops": 17.9214 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 16.2083 + "tflops": 22.6665 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 16.3604 + "tflops": 22.9522 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 20.9934 + "tflops": 24.2647 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 21.2141 + "tflops": 25.7082 } ] }, @@ -153,67 +153,67 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 1.3524 + "tflops": 0.6712 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.7727 + "tflops": 1.0964 }, { "batch_size": 4, "kernel": "wvsplitk_int4", - "tflops": 1.7153 + "tflops": 1.7156 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 3.3279 + "tflops": 1.7903 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 6.537 + "tflops": 3.6947 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 12.6312 + "tflops": 7.2291 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 12.9988 + "tflops": 9.0963 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 19.7027 + "tflops": 16.2777 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 18.2978 + "tflops": 15.2409 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 19.0515 + "tflops": 16.9796 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 18.7332 + "tflops": 17.4721 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 23.7608 + "tflops": 24.4999 }, { "batch_size": 4096, "kernel": 
"hybrid_triton_w4a16", - "tflops": 23.8789 + "tflops": 24.4845 } ] }, @@ -223,67 +223,67 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 1.2658 + "tflops": 0.6547 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.7165 + "tflops": 1.07 }, { "batch_size": 4, "kernel": "wvsplitk_int4", - "tflops": 1.6581 + "tflops": 1.6744 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 3.0787 + "tflops": 1.8969 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 6.1021 + "tflops": 3.657 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 11.821 + "tflops": 7.2317 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 12.8133 + "tflops": 9.087 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 18.5013 + "tflops": 17.3451 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 17.0859 + "tflops": 14.4696 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 18.1149 + "tflops": 16.4064 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 18.0408 + "tflops": 16.6163 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 23.729 + "tflops": 23.8826 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 23.7454 + "tflops": 24.1585 } ] } @@ -301,67 +301,67 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 1.2864 + "tflops": 0.719 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.7909 + "tflops": 1.2034 }, { "batch_size": 4, "kernel": "wvsplitk_int4", - "tflops": 1.8736 + "tflops": 1.8764 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 3.8731 + "tflops": 1.9336 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 7.5097 + "tflops": 3.8306 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 14.3792 + "tflops": 7.0488 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 13.0927 + "tflops": 10.1889 }, { "batch_size": 128, 
"kernel": "hybrid_triton_w4a16", - "tflops": 21.705 + "tflops": 11.5428 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 16.4913 + "tflops": 19.4123 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 16.6033 + "tflops": 23.5661 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 17.6707 + "tflops": 25.6054 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 23.0623 + "tflops": 27.7113 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 23.4599 + "tflops": 28.813 } ] }, @@ -371,67 +371,67 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 1.1639 + "tflops": 0.6914 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.7231 + "tflops": 1.1317 }, { "batch_size": 4, "kernel": "wvsplitk_int4", - "tflops": 1.8702 + "tflops": 1.8516 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 3.6878 + "tflops": 1.9149 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 7.222 + "tflops": 3.7832 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 13.4316 + "tflops": 6.921 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 12.8495 + "tflops": 10.0228 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 18.4533 + "tflops": 11.3722 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 16.1602 + "tflops": 19.109 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 16.5573 + "tflops": 23.3777 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 17.2919 + "tflops": 25.8591 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 22.668 + "tflops": 26.7757 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 23.3596 + "tflops": 27.0458 } ] }, @@ -441,67 +441,67 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 1.2682 + "tflops": 0.7112 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.7146 + "tflops": 
1.187 }, { "batch_size": 4, "kernel": "wvsplitk_int4", - "tflops": 1.8464 + "tflops": 1.8553 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 3.5959 + "tflops": 1.7449 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 6.9594 + "tflops": 3.5004 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 13.3677 + "tflops": 7.233 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 14.0524 + "tflops": 8.5988 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 22.9062 + "tflops": 16.7872 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 18.3427 + "tflops": 15.0835 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 18.1713 + "tflops": 15.9189 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 19.3614 + "tflops": 17.6731 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 24.3312 + "tflops": 24.877 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 24.5709 + "tflops": 24.73 } ] }, @@ -511,67 +511,643 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 1.1972 + "tflops": 0.6977 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.7096 + "tflops": 1.1316 }, { "batch_size": 4, "kernel": "wvsplitk_int4", - "tflops": 1.8188 + "tflops": 1.8029 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 3.4401 + "tflops": 1.8482 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 6.7777 + "tflops": 3.6417 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 13.0765 + "tflops": 7.2745 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 13.7838 + "tflops": 8.8788 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 20.377 + "tflops": 16.5067 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 17.4448 + "tflops": 14.3658 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 17.6678 + "tflops": 
15.5999 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 18.9946 + "tflops": 17.0667 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 23.791 + "tflops": 24.8951 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 24.435 + "tflops": 24.0047 + } + ] + } + ] + }, + { + "in_features": 2048, + "out_features": 4096, + "group_size": 128, + "comment": "Qwen3-1.7B qkv_proj", + "providers": [ + { + "provider": "hybrid-w4a16", + "baselines": [ + { + "batch_size": 1, + "kernel": "wvsplitk_int4", + "tflops": 0.7712 + }, + { + "batch_size": 2, + "kernel": "wvsplitk_int4", + "tflops": 1.3511 + }, + { + "batch_size": 4, + "kernel": "wvsplitk_int4", + "tflops": 2.3393 + }, + { + "batch_size": 8, + "kernel": "hybrid_triton_w4a16", + "tflops": 1.9539 + }, + { + "batch_size": 16, + "kernel": "hybrid_triton_w4a16", + "tflops": 3.7446 + }, + { + "batch_size": 32, + "kernel": "hybrid_triton_w4a16", + "tflops": 7.0874 + }, + { + "batch_size": 64, + "kernel": "hybrid_triton_w4a16", + "tflops": 10.4861 + }, + { + "batch_size": 128, + "kernel": "hybrid_triton_w4a16", + "tflops": 12.2436 + }, + { + "batch_size": 256, + "kernel": "hybrid_triton_w4a16", + "tflops": 20.3996 + }, + { + "batch_size": 512, + "kernel": "hybrid_triton_w4a16", + "tflops": 20.2568 + }, + { + "batch_size": 1024, + "kernel": "hybrid_triton_w4a16", + "tflops": 25.9767 + }, + { + "batch_size": 2048, + "kernel": "hybrid_triton_w4a16", + "tflops": 28.2727 + }, + { + "batch_size": 4096, + "kernel": "hybrid_triton_w4a16", + "tflops": 27.5906 + } + ] + }, + { + "provider": "hybrid-w4a16-zp", + "baselines": [ + { + "batch_size": 1, + "kernel": "wvsplitk_int4", + "tflops": 0.7362 + }, + { + "batch_size": 2, + "kernel": "wvsplitk_int4", + "tflops": 1.2908 + }, + { + "batch_size": 4, + "kernel": "wvsplitk_int4", + "tflops": 2.1995 + }, + { + "batch_size": 8, + "kernel": "hybrid_triton_w4a16", + "tflops": 1.8172 + }, + { + "batch_size": 16, + "kernel": 
"hybrid_triton_w4a16", + "tflops": 3.6789 + }, + { + "batch_size": 32, + "kernel": "hybrid_triton_w4a16", + "tflops": 7.1657 + }, + { + "batch_size": 64, + "kernel": "hybrid_triton_w4a16", + "tflops": 9.9838 + }, + { + "batch_size": 128, + "kernel": "hybrid_triton_w4a16", + "tflops": 11.4469 + }, + { + "batch_size": 256, + "kernel": "hybrid_triton_w4a16", + "tflops": 19.3413 + }, + { + "batch_size": 512, + "kernel": "hybrid_triton_w4a16", + "tflops": 22.1439 + }, + { + "batch_size": 1024, + "kernel": "hybrid_triton_w4a16", + "tflops": 24.2663 + }, + { + "batch_size": 2048, + "kernel": "hybrid_triton_w4a16", + "tflops": 26.7024 + }, + { + "batch_size": 4096, + "kernel": "hybrid_triton_w4a16", + "tflops": 26.02 + } + ] + }, + { + "provider": "hybrid-w4a16-bf16", + "baselines": [ + { + "batch_size": 1, + "kernel": "wvsplitk_int4", + "tflops": 0.7876 + }, + { + "batch_size": 2, + "kernel": "wvsplitk_int4", + "tflops": 1.3532 + }, + { + "batch_size": 4, + "kernel": "wvsplitk_int4", + "tflops": 2.3727 + }, + { + "batch_size": 8, + "kernel": "hybrid_triton_w4a16", + "tflops": 1.8373 + }, + { + "batch_size": 16, + "kernel": "hybrid_triton_w4a16", + "tflops": 3.6874 + }, + { + "batch_size": 32, + "kernel": "hybrid_triton_w4a16", + "tflops": 7.2804 + }, + { + "batch_size": 64, + "kernel": "hybrid_triton_w4a16", + "tflops": 9.1723 + }, + { + "batch_size": 128, + "kernel": "hybrid_triton_w4a16", + "tflops": 18.7409 + }, + { + "batch_size": 256, + "kernel": "hybrid_triton_w4a16", + "tflops": 15.3638 + }, + { + "batch_size": 512, + "kernel": "hybrid_triton_w4a16", + "tflops": 16.9482 + }, + { + "batch_size": 1024, + "kernel": "hybrid_triton_w4a16", + "tflops": 18.4613 + }, + { + "batch_size": 2048, + "kernel": "hybrid_triton_w4a16", + "tflops": 25.2814 + }, + { + "batch_size": 4096, + "kernel": "hybrid_triton_w4a16", + "tflops": 21.5325 + } + ] + }, + { + "provider": "hybrid-w4a16-zp-bf16", + "baselines": [ + { + "batch_size": 1, + "kernel": "wvsplitk_int4", + "tflops": 0.7549 + 
}, + { + "batch_size": 2, + "kernel": "wvsplitk_int4", + "tflops": 1.3094 + }, + { + "batch_size": 4, + "kernel": "wvsplitk_int4", + "tflops": 2.2304 + }, + { + "batch_size": 8, + "kernel": "hybrid_triton_w4a16", + "tflops": 1.8426 + }, + { + "batch_size": 16, + "kernel": "hybrid_triton_w4a16", + "tflops": 3.6665 + }, + { + "batch_size": 32, + "kernel": "hybrid_triton_w4a16", + "tflops": 7.255 + }, + { + "batch_size": 64, + "kernel": "hybrid_triton_w4a16", + "tflops": 8.0271 + }, + { + "batch_size": 128, + "kernel": "hybrid_triton_w4a16", + "tflops": 17.1986 + }, + { + "batch_size": 256, + "kernel": "hybrid_triton_w4a16", + "tflops": 13.7228 + }, + { + "batch_size": 512, + "kernel": "hybrid_triton_w4a16", + "tflops": 16.6984 + }, + { + "batch_size": 1024, + "kernel": "hybrid_triton_w4a16", + "tflops": 17.7128 + }, + { + "batch_size": 2048, + "kernel": "hybrid_triton_w4a16", + "tflops": 23.8339 + }, + { + "batch_size": 4096, + "kernel": "hybrid_triton_w4a16", + "tflops": 21.5273 + } + ] + } + ] + }, + { + "in_features": 2048, + "out_features": 12288, + "group_size": 128, + "comment": "Qwen3-1.7B gate_up_proj", + "providers": [ + { + "provider": "hybrid-w4a16", + "baselines": [ + { + "batch_size": 1, + "kernel": "wvsplitk_int4", + "tflops": 0.879 + }, + { + "batch_size": 2, + "kernel": "wvsplitk_int4", + "tflops": 1.5937 + }, + { + "batch_size": 4, + "kernel": "wvsplitk_int4", + "tflops": 2.756 + }, + { + "batch_size": 8, + "kernel": "hybrid_triton_w4a16", + "tflops": 1.8483 + }, + { + "batch_size": 16, + "kernel": "hybrid_triton_w4a16", + "tflops": 3.7156 + }, + { + "batch_size": 32, + "kernel": "hybrid_triton_w4a16", + "tflops": 7.2747 + }, + { + "batch_size": 64, + "kernel": "hybrid_triton_w4a16", + "tflops": 15.2105 + }, + { + "batch_size": 128, + "kernel": "hybrid_triton_w4a16", + "tflops": 9.0646 + }, + { + "batch_size": 256, + "kernel": "hybrid_triton_w4a16", + "tflops": 19.5289 + }, + { + "batch_size": 512, + "kernel": "hybrid_triton_w4a16", + "tflops": 
24.5725 + }, + { + "batch_size": 1024, + "kernel": "hybrid_triton_w4a16", + "tflops": 28.7206 + }, + { + "batch_size": 2048, + "kernel": "hybrid_triton_w4a16", + "tflops": 29.5579 + }, + { + "batch_size": 4096, + "kernel": "hybrid_triton_w4a16", + "tflops": 29.5907 + } + ] + }, + { + "provider": "hybrid-w4a16-zp", + "baselines": [ + { + "batch_size": 1, + "kernel": "wvsplitk_int4", + "tflops": 0.8752 + }, + { + "batch_size": 2, + "kernel": "wvsplitk_int4", + "tflops": 1.577 + }, + { + "batch_size": 4, + "kernel": "wvsplitk_int4", + "tflops": 2.7324 + }, + { + "batch_size": 8, + "kernel": "hybrid_triton_w4a16", + "tflops": 1.7323 + }, + { + "batch_size": 16, + "kernel": "hybrid_triton_w4a16", + "tflops": 3.3803 + }, + { + "batch_size": 32, + "kernel": "hybrid_triton_w4a16", + "tflops": 6.9878 + }, + { + "batch_size": 64, + "kernel": "hybrid_triton_w4a16", + "tflops": 14.8802 + }, + { + "batch_size": 128, + "kernel": "hybrid_triton_w4a16", + "tflops": 8.9848 + }, + { + "batch_size": 256, + "kernel": "hybrid_triton_w4a16", + "tflops": 17.3786 + }, + { + "batch_size": 512, + "kernel": "hybrid_triton_w4a16", + "tflops": 25.5645 + }, + { + "batch_size": 1024, + "kernel": "hybrid_triton_w4a16", + "tflops": 25.9758 + }, + { + "batch_size": 2048, + "kernel": "hybrid_triton_w4a16", + "tflops": 27.1691 + }, + { + "batch_size": 4096, + "kernel": "hybrid_triton_w4a16", + "tflops": 27.3503 + } + ] + }, + { + "provider": "hybrid-w4a16-bf16", + "baselines": [ + { + "batch_size": 1, + "kernel": "wvsplitk_int4", + "tflops": 0.9022 + }, + { + "batch_size": 2, + "kernel": "wvsplitk_int4", + "tflops": 1.6344 + }, + { + "batch_size": 4, + "kernel": "wvsplitk_int4", + "tflops": 2.7576 + }, + { + "batch_size": 8, + "kernel": "hybrid_triton_w4a16", + "tflops": 1.8446 + }, + { + "batch_size": 16, + "kernel": "hybrid_triton_w4a16", + "tflops": 3.6317 + }, + { + "batch_size": 32, + "kernel": "hybrid_triton_w4a16", + "tflops": 7.1866 + }, + { + "batch_size": 64, + "kernel": 
"hybrid_triton_w4a16", + "tflops": 6.8664 + }, + { + "batch_size": 128, + "kernel": "hybrid_triton_w4a16", + "tflops": 16.2357 + }, + { + "batch_size": 256, + "kernel": "hybrid_triton_w4a16", + "tflops": 25.8467 + }, + { + "batch_size": 512, + "kernel": "hybrid_triton_w4a16", + "tflops": 26.1517 + }, + { + "batch_size": 1024, + "kernel": "hybrid_triton_w4a16", + "tflops": 25.3962 + }, + { + "batch_size": 2048, + "kernel": "hybrid_triton_w4a16", + "tflops": 24.3667 + }, + { + "batch_size": 4096, + "kernel": "hybrid_triton_w4a16", + "tflops": 21.8953 + } + ] + }, + { + "provider": "hybrid-w4a16-zp-bf16", + "baselines": [ + { + "batch_size": 1, + "kernel": "wvsplitk_int4", + "tflops": 0.8841 + }, + { + "batch_size": 2, + "kernel": "wvsplitk_int4", + "tflops": 1.5894 + }, + { + "batch_size": 4, + "kernel": "wvsplitk_int4", + "tflops": 2.6998 + }, + { + "batch_size": 8, + "kernel": "hybrid_triton_w4a16", + "tflops": 1.7724 + }, + { + "batch_size": 16, + "kernel": "hybrid_triton_w4a16", + "tflops": 3.6322 + }, + { + "batch_size": 32, + "kernel": "hybrid_triton_w4a16", + "tflops": 6.9618 + }, + { + "batch_size": 64, + "kernel": "hybrid_triton_w4a16", + "tflops": 6.2006 + }, + { + "batch_size": 128, + "kernel": "hybrid_triton_w4a16", + "tflops": 15.3222 + }, + { + "batch_size": 256, + "kernel": "hybrid_triton_w4a16", + "tflops": 25.62 + }, + { + "batch_size": 512, + "kernel": "hybrid_triton_w4a16", + "tflops": 26.1042 + }, + { + "batch_size": 1024, + "kernel": "hybrid_triton_w4a16", + "tflops": 25.2271 + }, + { + "batch_size": 2048, + "kernel": "hybrid_triton_w4a16", + "tflops": 24.4834 + }, + { + "batch_size": 4096, + "kernel": "hybrid_triton_w4a16", + "tflops": 21.8037 } ] } @@ -581,7 +1157,1159 @@ "in_features": 2048, "out_features": 32768, "group_size": 128, - "comment": "gemma-2b gate_up_proj", + "comment": "gemma-2b gate_up_proj", + "providers": [ + { + "provider": "hybrid-w4a16", + "baselines": [ + { + "batch_size": 1, + "kernel": "wvsplitk_int4", + "tflops": 0.892 
+ }, + { + "batch_size": 2, + "kernel": "wvsplitk_int4", + "tflops": 1.7224 + }, + { + "batch_size": 4, + "kernel": "wvsplitk_int4", + "tflops": 3.3645 + }, + { + "batch_size": 8, + "kernel": "hybrid_triton_w4a16", + "tflops": 2.0896 + }, + { + "batch_size": 16, + "kernel": "hybrid_triton_w4a16", + "tflops": 3.9622 + }, + { + "batch_size": 32, + "kernel": "hybrid_triton_w4a16", + "tflops": 8.4102 + }, + { + "batch_size": 64, + "kernel": "hybrid_triton_w4a16", + "tflops": 18.8817 + }, + { + "batch_size": 128, + "kernel": "hybrid_triton_w4a16", + "tflops": 14.0716 + }, + { + "batch_size": 256, + "kernel": "hybrid_triton_w4a16", + "tflops": 22.8434 + }, + { + "batch_size": 512, + "kernel": "hybrid_triton_w4a16", + "tflops": 27.9336 + }, + { + "batch_size": 1024, + "kernel": "hybrid_triton_w4a16", + "tflops": 29.7949 + }, + { + "batch_size": 2048, + "kernel": "hybrid_triton_w4a16", + "tflops": 29.3619 + }, + { + "batch_size": 4096, + "kernel": "hybrid_triton_w4a16", + "tflops": 29.6534 + } + ] + }, + { + "provider": "hybrid-w4a16-zp", + "baselines": [ + { + "batch_size": 1, + "kernel": "wvsplitk_int4", + "tflops": 0.8837 + }, + { + "batch_size": 2, + "kernel": "wvsplitk_int4", + "tflops": 1.6684 + }, + { + "batch_size": 4, + "kernel": "wvsplitk_int4", + "tflops": 3.2689 + }, + { + "batch_size": 8, + "kernel": "hybrid_triton_w4a16", + "tflops": 1.7561 + }, + { + "batch_size": 16, + "kernel": "hybrid_triton_w4a16", + "tflops": 3.4446 + }, + { + "batch_size": 32, + "kernel": "hybrid_triton_w4a16", + "tflops": 7.7809 + }, + { + "batch_size": 64, + "kernel": "hybrid_triton_w4a16", + "tflops": 18.9021 + }, + { + "batch_size": 128, + "kernel": "hybrid_triton_w4a16", + "tflops": 11.8286 + }, + { + "batch_size": 256, + "kernel": "hybrid_triton_w4a16", + "tflops": 19.4745 + }, + { + "batch_size": 512, + "kernel": "hybrid_triton_w4a16", + "tflops": 26.1853 + }, + { + "batch_size": 1024, + "kernel": "hybrid_triton_w4a16", + "tflops": 26.5802 + }, + { + "batch_size": 2048, + 
"kernel": "hybrid_triton_w4a16", + "tflops": 26.0651 + }, + { + "batch_size": 4096, + "kernel": "hybrid_triton_w4a16", + "tflops": 27.286 + } + ] + }, + { + "provider": "hybrid-w4a16-bf16", + "baselines": [ + { + "batch_size": 1, + "kernel": "wvsplitk_int4", + "tflops": 0.9031 + }, + { + "batch_size": 2, + "kernel": "wvsplitk_int4", + "tflops": 1.7037 + }, + { + "batch_size": 4, + "kernel": "wvsplitk_int4", + "tflops": 3.3285 + }, + { + "batch_size": 8, + "kernel": "hybrid_triton_w4a16", + "tflops": 2.47 + }, + { + "batch_size": 16, + "kernel": "hybrid_triton_w4a16", + "tflops": 4.7825 + }, + { + "batch_size": 32, + "kernel": "hybrid_triton_w4a16", + "tflops": 8.9034 + }, + { + "batch_size": 64, + "kernel": "hybrid_triton_w4a16", + "tflops": 6.3789 + }, + { + "batch_size": 128, + "kernel": "hybrid_triton_w4a16", + "tflops": 19.9048 + }, + { + "batch_size": 256, + "kernel": "hybrid_triton_w4a16", + "tflops": 25.2352 + }, + { + "batch_size": 512, + "kernel": "hybrid_triton_w4a16", + "tflops": 26.7701 + }, + { + "batch_size": 1024, + "kernel": "hybrid_triton_w4a16", + "tflops": 25.5549 + }, + { + "batch_size": 2048, + "kernel": "hybrid_triton_w4a16", + "tflops": 24.0162 + }, + { + "batch_size": 4096, + "kernel": "hybrid_triton_w4a16", + "tflops": 21.1146 + } + ] + }, + { + "provider": "hybrid-w4a16-zp-bf16", + "baselines": [ + { + "batch_size": 1, + "kernel": "wvsplitk_int4", + "tflops": 0.8947 + }, + { + "batch_size": 2, + "kernel": "wvsplitk_int4", + "tflops": 1.6781 + }, + { + "batch_size": 4, + "kernel": "wvsplitk_int4", + "tflops": 3.1722 + }, + { + "batch_size": 8, + "kernel": "hybrid_triton_w4a16", + "tflops": 2.4506 + }, + { + "batch_size": 16, + "kernel": "hybrid_triton_w4a16", + "tflops": 4.8341 + }, + { + "batch_size": 32, + "kernel": "hybrid_triton_w4a16", + "tflops": 8.9714 + }, + { + "batch_size": 64, + "kernel": "hybrid_triton_w4a16", + "tflops": 6.1512 + }, + { + "batch_size": 128, + "kernel": "hybrid_triton_w4a16", + "tflops": 18.9959 + }, + { + 
"batch_size": 256, + "kernel": "hybrid_triton_w4a16", + "tflops": 26.4757 + }, + { + "batch_size": 512, + "kernel": "hybrid_triton_w4a16", + "tflops": 26.0136 + }, + { + "batch_size": 1024, + "kernel": "hybrid_triton_w4a16", + "tflops": 25.4233 + }, + { + "batch_size": 2048, + "kernel": "hybrid_triton_w4a16", + "tflops": 22.7112 + }, + { + "batch_size": 4096, + "kernel": "hybrid_triton_w4a16", + "tflops": 21.1973 + } + ] + } + ] + }, + { + "in_features": 2560, + "out_features": 2560, + "group_size": 128, + "comment": "Qwen3-4B o_proj", + "providers": [ + { + "provider": "hybrid-w4a16", + "baselines": [ + { + "batch_size": 1, + "kernel": "wvsplitk_int4", + "tflops": 0.726 + }, + { + "batch_size": 2, + "kernel": "wvsplitk_int4", + "tflops": 1.4351 + }, + { + "batch_size": 4, + "kernel": "wvsplitk_int4", + "tflops": 1.9532 + }, + { + "batch_size": 8, + "kernel": "hybrid_triton_w4a16", + "tflops": 4.394 + }, + { + "batch_size": 16, + "kernel": "hybrid_triton_w4a16", + "tflops": 8.5481 + }, + { + "batch_size": 32, + "kernel": "hybrid_triton_w4a16", + "tflops": 14.0654 + }, + { + "batch_size": 64, + "kernel": "hybrid_triton_w4a16", + "tflops": 15.8297 + }, + { + "batch_size": 128, + "kernel": "hybrid_triton_w4a16", + "tflops": 18.8825 + }, + { + "batch_size": 256, + "kernel": "hybrid_triton_w4a16", + "tflops": 26.9786 + }, + { + "batch_size": 512, + "kernel": "hybrid_triton_w4a16", + "tflops": 30.3899 + }, + { + "batch_size": 1024, + "kernel": "hybrid_triton_w4a16", + "tflops": 30.0081 + }, + { + "batch_size": 2048, + "kernel": "hybrid_triton_w4a16", + "tflops": 29.5283 + }, + { + "batch_size": 4096, + "kernel": "hybrid_triton_w4a16", + "tflops": 30.2194 + } + ] + }, + { + "provider": "hybrid-w4a16-zp", + "baselines": [ + { + "batch_size": 1, + "kernel": "wvsplitk_int4", + "tflops": 0.6892 + }, + { + "batch_size": 2, + "kernel": "wvsplitk_int4", + "tflops": 1.378 + }, + { + "batch_size": 4, + "kernel": "wvsplitk_int4", + "tflops": 1.9479 + }, + { + "batch_size": 8, + 
"kernel": "hybrid_triton_w4a16", + "tflops": 4.3273 + }, + { + "batch_size": 16, + "kernel": "hybrid_triton_w4a16", + "tflops": 8.4275 + }, + { + "batch_size": 32, + "kernel": "hybrid_triton_w4a16", + "tflops": 13.791 + }, + { + "batch_size": 64, + "kernel": "hybrid_triton_w4a16", + "tflops": 17.6013 + }, + { + "batch_size": 128, + "kernel": "hybrid_triton_w4a16", + "tflops": 18.7139 + }, + { + "batch_size": 256, + "kernel": "hybrid_triton_w4a16", + "tflops": 26.3896 + }, + { + "batch_size": 512, + "kernel": "hybrid_triton_w4a16", + "tflops": 29.908 + }, + { + "batch_size": 1024, + "kernel": "hybrid_triton_w4a16", + "tflops": 29.1436 + }, + { + "batch_size": 2048, + "kernel": "hybrid_triton_w4a16", + "tflops": 30.162 + }, + { + "batch_size": 4096, + "kernel": "hybrid_triton_w4a16", + "tflops": 29.8204 + } + ] + }, + { + "provider": "hybrid-w4a16-bf16", + "baselines": [ + { + "batch_size": 1, + "kernel": "wvsplitk_int4", + "tflops": 0.7209 + }, + { + "batch_size": 2, + "kernel": "wvsplitk_int4", + "tflops": 1.4277 + }, + { + "batch_size": 4, + "kernel": "wvsplitk_int4", + "tflops": 1.9484 + }, + { + "batch_size": 8, + "kernel": "hybrid_triton_w4a16", + "tflops": 3.1062 + }, + { + "batch_size": 16, + "kernel": "hybrid_triton_w4a16", + "tflops": 6.2667 + }, + { + "batch_size": 32, + "kernel": "hybrid_triton_w4a16", + "tflops": 12.322 + }, + { + "batch_size": 64, + "kernel": "hybrid_triton_w4a16", + "tflops": 13.7467 + }, + { + "batch_size": 128, + "kernel": "hybrid_triton_w4a16", + "tflops": 19.0764 + }, + { + "batch_size": 256, + "kernel": "hybrid_triton_w4a16", + "tflops": 20.9364 + }, + { + "batch_size": 512, + "kernel": "hybrid_triton_w4a16", + "tflops": 21.3051 + }, + { + "batch_size": 1024, + "kernel": "hybrid_triton_w4a16", + "tflops": 20.2106 + }, + { + "batch_size": 2048, + "kernel": "hybrid_triton_w4a16", + "tflops": 25.5842 + }, + { + "batch_size": 4096, + "kernel": "hybrid_triton_w4a16", + "tflops": 25.6367 + } + ] + }, + { + "provider": 
"hybrid-w4a16-zp-bf16", + "baselines": [ + { + "batch_size": 1, + "kernel": "wvsplitk_int4", + "tflops": 0.6974 + }, + { + "batch_size": 2, + "kernel": "wvsplitk_int4", + "tflops": 1.4085 + }, + { + "batch_size": 4, + "kernel": "wvsplitk_int4", + "tflops": 1.9461 + }, + { + "batch_size": 8, + "kernel": "hybrid_triton_w4a16", + "tflops": 3.1308 + }, + { + "batch_size": 16, + "kernel": "hybrid_triton_w4a16", + "tflops": 6.2299 + }, + { + "batch_size": 32, + "kernel": "hybrid_triton_w4a16", + "tflops": 12.3413 + }, + { + "batch_size": 64, + "kernel": "hybrid_triton_w4a16", + "tflops": 13.8039 + }, + { + "batch_size": 128, + "kernel": "hybrid_triton_w4a16", + "tflops": 19.2185 + }, + { + "batch_size": 256, + "kernel": "hybrid_triton_w4a16", + "tflops": 19.972 + }, + { + "batch_size": 512, + "kernel": "hybrid_triton_w4a16", + "tflops": 20.8508 + }, + { + "batch_size": 1024, + "kernel": "hybrid_triton_w4a16", + "tflops": 20.1444 + }, + { + "batch_size": 2048, + "kernel": "hybrid_triton_w4a16", + "tflops": 25.2893 + }, + { + "batch_size": 4096, + "kernel": "hybrid_triton_w4a16", + "tflops": 25.5165 + } + ] + } + ] + }, + { + "in_features": 2560, + "out_features": 3840, + "group_size": 128, + "comment": "Qwen3-4B qkv_proj", + "providers": [ + { + "provider": "hybrid-w4a16", + "baselines": [ + { + "batch_size": 1, + "kernel": "wvsplitk_int4", + "tflops": 0.7781 + }, + { + "batch_size": 2, + "kernel": "wvsplitk_int4", + "tflops": 1.5374 + }, + { + "batch_size": 4, + "kernel": "wvsplitk_int4", + "tflops": 3.0369 + }, + { + "batch_size": 8, + "kernel": "hybrid_triton_w4a16", + "tflops": 5.0776 + }, + { + "batch_size": 16, + "kernel": "hybrid_triton_w4a16", + "tflops": 10.0766 + }, + { + "batch_size": 32, + "kernel": "hybrid_triton_w4a16", + "tflops": 16.4848 + }, + { + "batch_size": 64, + "kernel": "hybrid_triton_w4a16", + "tflops": 21.1671 + }, + { + "batch_size": 128, + "kernel": "hybrid_triton_w4a16", + "tflops": 22.9984 + }, + { + "batch_size": 256, + "kernel": 
"hybrid_triton_w4a16", + "tflops": 28.4701 + }, + { + "batch_size": 512, + "kernel": "hybrid_triton_w4a16", + "tflops": 31.2348 + }, + { + "batch_size": 1024, + "kernel": "hybrid_triton_w4a16", + "tflops": 30.9706 + }, + { + "batch_size": 2048, + "kernel": "hybrid_triton_w4a16", + "tflops": 31.4709 + }, + { + "batch_size": 4096, + "kernel": "hybrid_triton_w4a16", + "tflops": 32.084 + } + ] + }, + { + "provider": "hybrid-w4a16-zp", + "baselines": [ + { + "batch_size": 1, + "kernel": "wvsplitk_int4", + "tflops": 0.7436 + }, + { + "batch_size": 2, + "kernel": "wvsplitk_int4", + "tflops": 1.4621 + }, + { + "batch_size": 4, + "kernel": "wvsplitk_int4", + "tflops": 2.8975 + }, + { + "batch_size": 8, + "kernel": "hybrid_triton_w4a16", + "tflops": 4.893 + }, + { + "batch_size": 16, + "kernel": "hybrid_triton_w4a16", + "tflops": 9.8271 + }, + { + "batch_size": 32, + "kernel": "hybrid_triton_w4a16", + "tflops": 16.2037 + }, + { + "batch_size": 64, + "kernel": "hybrid_triton_w4a16", + "tflops": 20.3359 + }, + { + "batch_size": 128, + "kernel": "hybrid_triton_w4a16", + "tflops": 22.9433 + }, + { + "batch_size": 256, + "kernel": "hybrid_triton_w4a16", + "tflops": 28.3665 + }, + { + "batch_size": 512, + "kernel": "hybrid_triton_w4a16", + "tflops": 25.4327 + }, + { + "batch_size": 1024, + "kernel": "hybrid_triton_w4a16", + "tflops": 30.8691 + }, + { + "batch_size": 2048, + "kernel": "hybrid_triton_w4a16", + "tflops": 31.0562 + }, + { + "batch_size": 4096, + "kernel": "hybrid_triton_w4a16", + "tflops": 30.7338 + } + ] + }, + { + "provider": "hybrid-w4a16-bf16", + "baselines": [ + { + "batch_size": 1, + "kernel": "wvsplitk_int4", + "tflops": 0.7729 + }, + { + "batch_size": 2, + "kernel": "wvsplitk_int4", + "tflops": 1.5222 + }, + { + "batch_size": 4, + "kernel": "wvsplitk_int4", + "tflops": 3.0508 + }, + { + "batch_size": 8, + "kernel": "hybrid_triton_w4a16", + "tflops": 3.6647 + }, + { + "batch_size": 16, + "kernel": "hybrid_triton_w4a16", + "tflops": 7.2952 + }, + { + 
"batch_size": 32, + "kernel": "hybrid_triton_w4a16", + "tflops": 14.2748 + }, + { + "batch_size": 64, + "kernel": "hybrid_triton_w4a16", + "tflops": 16.4272 + }, + { + "batch_size": 128, + "kernel": "hybrid_triton_w4a16", + "tflops": 22.9136 + }, + { + "batch_size": 256, + "kernel": "hybrid_triton_w4a16", + "tflops": 21.3368 + }, + { + "batch_size": 512, + "kernel": "hybrid_triton_w4a16", + "tflops": 21.7525 + }, + { + "batch_size": 1024, + "kernel": "hybrid_triton_w4a16", + "tflops": 21.8 + }, + { + "batch_size": 2048, + "kernel": "hybrid_triton_w4a16", + "tflops": 25.7768 + }, + { + "batch_size": 4096, + "kernel": "hybrid_triton_w4a16", + "tflops": 25.8751 + } + ] + }, + { + "provider": "hybrid-w4a16-zp-bf16", + "baselines": [ + { + "batch_size": 1, + "kernel": "wvsplitk_int4", + "tflops": 0.7552 + }, + { + "batch_size": 2, + "kernel": "wvsplitk_int4", + "tflops": 1.5067 + }, + { + "batch_size": 4, + "kernel": "wvsplitk_int4", + "tflops": 2.8611 + }, + { + "batch_size": 8, + "kernel": "hybrid_triton_w4a16", + "tflops": 3.6573 + }, + { + "batch_size": 16, + "kernel": "hybrid_triton_w4a16", + "tflops": 7.0823 + }, + { + "batch_size": 32, + "kernel": "hybrid_triton_w4a16", + "tflops": 14.208 + }, + { + "batch_size": 64, + "kernel": "hybrid_triton_w4a16", + "tflops": 16.2358 + }, + { + "batch_size": 128, + "kernel": "hybrid_triton_w4a16", + "tflops": 22.9965 + }, + { + "batch_size": 256, + "kernel": "hybrid_triton_w4a16", + "tflops": 20.7744 + }, + { + "batch_size": 512, + "kernel": "hybrid_triton_w4a16", + "tflops": 21.1387 + }, + { + "batch_size": 1024, + "kernel": "hybrid_triton_w4a16", + "tflops": 20.6036 + }, + { + "batch_size": 2048, + "kernel": "hybrid_triton_w4a16", + "tflops": 25.8254 + }, + { + "batch_size": 4096, + "kernel": "hybrid_triton_w4a16", + "tflops": 25.8695 + } + ] + } + ] + }, + { + "in_features": 2560, + "out_features": 4096, + "group_size": 128, + "comment": "Gemma3-4B qkv_proj", + "providers": [ + { + "provider": "hybrid-w4a16", + 
"baselines": [ + { + "batch_size": 1, + "kernel": "wvsplitk_int4", + "tflops": 0.7708 + }, + { + "batch_size": 2, + "kernel": "wvsplitk_int4", + "tflops": 1.4918 + }, + { + "batch_size": 4, + "kernel": "wvsplitk_int4", + "tflops": 3.0638 + }, + { + "batch_size": 8, + "kernel": "hybrid_triton_w4a16", + "tflops": 4.93 + }, + { + "batch_size": 16, + "kernel": "hybrid_triton_w4a16", + "tflops": 9.7878 + }, + { + "batch_size": 32, + "kernel": "hybrid_triton_w4a16", + "tflops": 15.8432 + }, + { + "batch_size": 64, + "kernel": "hybrid_triton_w4a16", + "tflops": 18.9558 + }, + { + "batch_size": 128, + "kernel": "hybrid_triton_w4a16", + "tflops": 21.8328 + }, + { + "batch_size": 256, + "kernel": "hybrid_triton_w4a16", + "tflops": 27.2514 + }, + { + "batch_size": 512, + "kernel": "hybrid_triton_w4a16", + "tflops": 26.6855 + }, + { + "batch_size": 1024, + "kernel": "hybrid_triton_w4a16", + "tflops": 29.0564 + }, + { + "batch_size": 2048, + "kernel": "hybrid_triton_w4a16", + "tflops": 29.8627 + }, + { + "batch_size": 4096, + "kernel": "hybrid_triton_w4a16", + "tflops": 28.1164 + } + ] + }, + { + "provider": "hybrid-w4a16-zp", + "baselines": [ + { + "batch_size": 1, + "kernel": "wvsplitk_int4", + "tflops": 0.7336 + }, + { + "batch_size": 2, + "kernel": "wvsplitk_int4", + "tflops": 1.4161 + }, + { + "batch_size": 4, + "kernel": "wvsplitk_int4", + "tflops": 2.8877 + }, + { + "batch_size": 8, + "kernel": "hybrid_triton_w4a16", + "tflops": 4.8176 + }, + { + "batch_size": 16, + "kernel": "hybrid_triton_w4a16", + "tflops": 9.6441 + }, + { + "batch_size": 32, + "kernel": "hybrid_triton_w4a16", + "tflops": 15.5835 + }, + { + "batch_size": 64, + "kernel": "hybrid_triton_w4a16", + "tflops": 18.582 + }, + { + "batch_size": 128, + "kernel": "hybrid_triton_w4a16", + "tflops": 21.9103 + }, + { + "batch_size": 256, + "kernel": "hybrid_triton_w4a16", + "tflops": 26.6035 + }, + { + "batch_size": 512, + "kernel": "hybrid_triton_w4a16", + "tflops": 25.5331 + }, + { + "batch_size": 1024, + 
"kernel": "hybrid_triton_w4a16", + "tflops": 26.7291 + }, + { + "batch_size": 2048, + "kernel": "hybrid_triton_w4a16", + "tflops": 28.842 + }, + { + "batch_size": 4096, + "kernel": "hybrid_triton_w4a16", + "tflops": 28.0576 + } + ] + }, + { + "provider": "hybrid-w4a16-bf16", + "baselines": [ + { + "batch_size": 1, + "kernel": "wvsplitk_int4", + "tflops": 0.7689 + }, + { + "batch_size": 2, + "kernel": "wvsplitk_int4", + "tflops": 1.4843 + }, + { + "batch_size": 4, + "kernel": "wvsplitk_int4", + "tflops": 3.0737 + }, + { + "batch_size": 8, + "kernel": "hybrid_triton_w4a16", + "tflops": 3.5694 + }, + { + "batch_size": 16, + "kernel": "hybrid_triton_w4a16", + "tflops": 7.1843 + }, + { + "batch_size": 32, + "kernel": "hybrid_triton_w4a16", + "tflops": 14.0153 + }, + { + "batch_size": 64, + "kernel": "hybrid_triton_w4a16", + "tflops": 15.449 + }, + { + "batch_size": 128, + "kernel": "hybrid_triton_w4a16", + "tflops": 21.7312 + }, + { + "batch_size": 256, + "kernel": "hybrid_triton_w4a16", + "tflops": 20.5059 + }, + { + "batch_size": 512, + "kernel": "hybrid_triton_w4a16", + "tflops": 19.5546 + }, + { + "batch_size": 1024, + "kernel": "hybrid_triton_w4a16", + "tflops": 20.7363 + }, + { + "batch_size": 2048, + "kernel": "hybrid_triton_w4a16", + "tflops": 25.2697 + }, + { + "batch_size": 4096, + "kernel": "hybrid_triton_w4a16", + "tflops": 23.1942 + } + ] + }, + { + "provider": "hybrid-w4a16-zp-bf16", + "baselines": [ + { + "batch_size": 1, + "kernel": "wvsplitk_int4", + "tflops": 0.7461 + }, + { + "batch_size": 2, + "kernel": "wvsplitk_int4", + "tflops": 1.4452 + }, + { + "batch_size": 4, + "kernel": "wvsplitk_int4", + "tflops": 2.8414 + }, + { + "batch_size": 8, + "kernel": "hybrid_triton_w4a16", + "tflops": 3.5876 + }, + { + "batch_size": 16, + "kernel": "hybrid_triton_w4a16", + "tflops": 7.1596 + }, + { + "batch_size": 32, + "kernel": "hybrid_triton_w4a16", + "tflops": 13.9781 + }, + { + "batch_size": 64, + "kernel": "hybrid_triton_w4a16", + "tflops": 15.3658 + }, + { + 
"batch_size": 128, + "kernel": "hybrid_triton_w4a16", + "tflops": 21.6818 + }, + { + "batch_size": 256, + "kernel": "hybrid_triton_w4a16", + "tflops": 19.9443 + }, + { + "batch_size": 512, + "kernel": "hybrid_triton_w4a16", + "tflops": 19.1908 + }, + { + "batch_size": 1024, + "kernel": "hybrid_triton_w4a16", + "tflops": 19.523 + }, + { + "batch_size": 2048, + "kernel": "hybrid_triton_w4a16", + "tflops": 25.3414 + }, + { + "batch_size": 4096, + "kernel": "hybrid_triton_w4a16", + "tflops": 22.8638 + } + ] + } + ] + }, + { + "in_features": 2560, + "out_features": 19456, + "group_size": 128, + "comment": "Qwen3-4B gate_up_proj", "providers": [ { "provider": "hybrid-w4a16", @@ -589,67 +2317,67 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 0.9247 + "tflops": 0.8647 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.7686 + "tflops": 1.6377 }, { "batch_size": 4, "kernel": "wvsplitk_int4", - "tflops": 3.4351 + "tflops": 3.1993 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 4.3068 + "tflops": 5.5504 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 8.2607 + "tflops": 10.7647 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 16.0032 + "tflops": 18.5179 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 9.257 + "tflops": 22.6372 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 20.6918 + "tflops": 26.9649 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 24.2849 + "tflops": 28.2119 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 24.5337 + "tflops": 30.7708 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 23.9014 + "tflops": 31.8231 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 21.6818 + "tflops": 31.4502 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 19.6505 + "tflops": 31.8407 } ] }, @@ -659,67 +2387,67 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - 
"tflops": 0.8991 + "tflops": 0.8326 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.6974 + "tflops": 1.5893 }, { "batch_size": 4, "kernel": "wvsplitk_int4", - "tflops": 3.2795 + "tflops": 3.1446 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 4.0476 + "tflops": 5.3388 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 7.9376 + "tflops": 10.5956 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 14.9119 + "tflops": 18.4664 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 7.4613 + "tflops": 22.227 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 20.293 + "tflops": 24.511 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 24.0634 + "tflops": 27.2131 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 24.3204 + "tflops": 29.0696 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 23.6918 + "tflops": 29.2548 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 21.3473 + "tflops": 30.0238 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 19.499 + "tflops": 29.851 } ] }, @@ -729,67 +2457,67 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 0.9152 + "tflops": 0.8512 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.758 + "tflops": 1.655 }, { "batch_size": 4, "kernel": "wvsplitk_int4", - "tflops": 3.3974 + "tflops": 3.2214 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 4.0196 + "tflops": 3.9624 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 7.8085 + "tflops": 7.9828 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 15.1246 + "tflops": 15.3018 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 9.0186 + "tflops": 19.8902 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 22.442 + "tflops": 24.4771 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 
25.9514 + "tflops": 26.0495 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 25.9164 + "tflops": 26.5034 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 24.9292 + "tflops": 26.6319 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 22.6886 + "tflops": 26.3667 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 21.0021 + "tflops": 25.9706 } ] }, @@ -799,67 +2527,67 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 0.8995 + "tflops": 0.8412 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.6913 + "tflops": 1.5981 }, { "batch_size": 4, "kernel": "wvsplitk_int4", - "tflops": 3.1925 + "tflops": 3.0966 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 3.8311 + "tflops": 4.036 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 7.4835 + "tflops": 7.9254 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 14.5089 + "tflops": 15.6375 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 7.34 + "tflops": 15.53 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 20.5148 + "tflops": 24.6339 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 25.7319 + "tflops": 26.0013 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 25.7786 + "tflops": 26.331 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 24.832 + "tflops": 26.4185 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 22.711 + "tflops": 26.3361 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 20.923 + "tflops": 25.8195 } ] } @@ -867,9 +2595,9 @@ }, { "in_features": 2560, - "out_features": 2560, + "out_features": 20480, "group_size": 128, - "comment": "Qwen3-4B o_proj", + "comment": "Gemma3-4B gate_up_proj", "providers": [ { "provider": "hybrid-w4a16", @@ -877,67 +2605,67 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 1.0641 + "tflops": 
0.8659 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.8888 + "tflops": 1.6824 }, { "batch_size": 4, "kernel": "wvsplitk_int4", - "tflops": 1.964 + "tflops": 3.32 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 4.2454 + "tflops": 5.7774 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 8.3269 + "tflops": 11.2576 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 15.3406 + "tflops": 19.6447 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 15.9707 + "tflops": 23.5209 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 18.9499 + "tflops": 28.2759 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 20.3059 + "tflops": 29.3752 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 22.1459 + "tflops": 31.3016 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 22.8095 + "tflops": 32.3593 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 23.7538 + "tflops": 32.3543 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 23.9946 + "tflops": 31.8623 } ] }, @@ -947,67 +2675,67 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 0.968 + "tflops": 0.8018 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.7862 + "tflops": 1.5962 }, { "batch_size": 4, "kernel": "wvsplitk_int4", - "tflops": 1.9541 + "tflops": 3.2303 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 4.1948 + "tflops": 5.5432 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 8.0128 + "tflops": 10.8348 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 15.5936 + "tflops": 19.199 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 15.3383 + "tflops": 22.951 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 18.4911 + "tflops": 25.9897 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 20.1208 + "tflops": 
28.2354 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 22.4179 + "tflops": 30.1167 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 22.8093 + "tflops": 30.383 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 23.6467 + "tflops": 30.0887 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 23.7181 + "tflops": 29.5603 } ] }, @@ -1017,67 +2745,67 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 1.0738 + "tflops": 0.8325 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.8448 + "tflops": 1.6699 }, { "batch_size": 4, "kernel": "wvsplitk_int4", - "tflops": 1.9545 + "tflops": 3.3335 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 3.9924 + "tflops": 4.093 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 7.9266 + "tflops": 8.0227 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 15.2644 + "tflops": 15.5758 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 17.4998 + "tflops": 20.769 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 20.0723 + "tflops": 24.3729 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 23.1021 + "tflops": 26.124 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 24.022 + "tflops": 26.2409 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 24.2114 + "tflops": 26.1704 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 25.2344 + "tflops": 25.6316 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 25.3184 + "tflops": 23.2542 } ] }, @@ -1087,77 +2815,77 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 0.9984 + "tflops": 0.8467 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.7989 + "tflops": 1.6289 }, { "batch_size": 4, "kernel": "wvsplitk_int4", - "tflops": 1.9533 + "tflops": 3.1568 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - 
"tflops": 3.8566 + "tflops": 4.0674 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 7.6189 + "tflops": 8.044 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 14.7116 + "tflops": 15.668 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 17.693 + "tflops": 15.7002 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 19.2561 + "tflops": 24.5155 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 21.4957 + "tflops": 25.9422 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 22.6414 + "tflops": 26.1215 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 22.7318 + "tflops": 25.9152 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 25.1294 + "tflops": 25.2893 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 25.3521 + "tflops": 23.256 } ] } ] }, { - "in_features": 2560, - "out_features": 3840, + "in_features": 3584, + "out_features": 3584, "group_size": 128, - "comment": "Qwen3-4B qkv_proj", + "comment": "Qwen2.5-7B o_proj", "providers": [ { "provider": "hybrid-w4a16", @@ -1165,67 +2893,67 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 0.9951 + "tflops": 0.8138 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.8274 + "tflops": 1.5895 }, { "batch_size": 4, "kernel": "wvsplitk_int4", - "tflops": 3.5253 + "tflops": 3.1286 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 4.3449 + "tflops": 5.0353 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 8.5221 + "tflops": 10.1187 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 16.2684 + "tflops": 16.1738 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 17.1006 + "tflops": 19.206 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 22.8312 + "tflops": 22.9001 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 23.5846 + "tflops": 29.125 }, 
{ "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 20.577 + "tflops": 31.4384 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 23.3841 + "tflops": 31.4869 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 24.6554 + "tflops": 31.9135 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 24.9148 + "tflops": 31.9798 } ] }, @@ -1235,67 +2963,67 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 0.9157 + "tflops": 0.7923 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.7331 + "tflops": 1.4846 }, { "batch_size": 4, "kernel": "wvsplitk_int4", - "tflops": 3.3217 + "tflops": 2.9162 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 4.235 + "tflops": 4.9161 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 8.2574 + "tflops": 9.8624 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 15.7746 + "tflops": 15.5065 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 17.1391 + "tflops": 19.4041 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 23.2947 + "tflops": 21.978 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 23.3903 + "tflops": 28.0788 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 20.7392 + "tflops": 23.8634 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 23.7488 + "tflops": 28.9643 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 24.5053 + "tflops": 29.0309 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 24.716 + "tflops": 27.9875 } ] }, @@ -1305,67 +3033,67 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 0.9856 + "tflops": 0.8272 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.827 + "tflops": 1.5829 }, { "batch_size": 4, "kernel": "wvsplitk_int4", - "tflops": 3.4782 + "tflops": 3.1034 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 4.1246 
+ "tflops": 3.531 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 8.1326 + "tflops": 6.349 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 15.6296 + "tflops": 10.9238 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 19.2748 + "tflops": 12.6554 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 25.0966 + "tflops": 15.7692 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 24.4635 + "tflops": 17.038 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 20.0961 + "tflops": 17.9134 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 24.5463 + "tflops": 18.7209 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 25.7778 + "tflops": 23.1103 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 25.8978 + "tflops": 23.7848 } ] }, @@ -1375,77 +3103,77 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 0.9305 + "tflops": 0.7149 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.7363 + "tflops": 1.3286 }, { "batch_size": 4, "kernel": "wvsplitk_int4", - "tflops": 3.284 + "tflops": 2.5558 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 4.0933 + "tflops": 2.5802 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 8.1012 + "tflops": 5.1835 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 15.6989 + "tflops": 10.5165 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 19.8573 + "tflops": 12.5711 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 23.2563 + "tflops": 16.0695 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 23.0665 + "tflops": 15.2853 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 18.8356 + "tflops": 15.7672 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 23.08 + "tflops": 15.7253 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", 
- "tflops": 25.6339 + "tflops": 19.6306 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 25.7817 + "tflops": 19.8578 } ] } ] }, { - "in_features": 2560, - "out_features": 19456, + "in_features": 3584, + "out_features": 4608, "group_size": 128, - "comment": "Qwen3-4B gate_up_proj", + "comment": "Qwen2.5-7B qkv_proj", "providers": [ { "provider": "hybrid-w4a16", @@ -1453,67 +3181,67 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 0.8928 + "tflops": 0.7335 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.7141 + "tflops": 1.3739 }, { "batch_size": 4, "kernel": "wvsplitk_int4", - "tflops": 3.2879 + "tflops": 2.7008 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 4.359 + "tflops": 4.3965 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 8.489 + "tflops": 8.6511 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 16.3508 + "tflops": 12.4757 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 21.7559 + "tflops": 16.1312 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 23.7674 + "tflops": 18.305 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 24.238 + "tflops": 21.7571 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 24.5302 + "tflops": 20.9852 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 24.3167 + "tflops": 22.6397 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 24.2346 + "tflops": 24.4099 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 24.0488 + "tflops": 24.413 } ] }, @@ -1523,67 +3251,67 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 0.854 + "tflops": 0.7174 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.6112 + "tflops": 1.3187 }, { "batch_size": 4, "kernel": "wvsplitk_int4", - "tflops": 3.1849 + "tflops": 2.5678 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 4.1882 + "tflops": 
4.2846 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 8.2449 + "tflops": 8.4968 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 15.686 + "tflops": 12.3798 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 15.3656 + "tflops": 15.6576 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 22.8964 + "tflops": 17.954 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 23.6275 + "tflops": 21.0857 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 23.9546 + "tflops": 20.6773 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 24.1914 + "tflops": 21.2765 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 24.057 + "tflops": 21.5989 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 23.812 + "tflops": 22.6052 } ] }, @@ -1593,67 +3321,67 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 0.8868 + "tflops": 0.7804 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.6983 + "tflops": 1.4558 }, { "batch_size": 4, "kernel": "wvsplitk_int4", - "tflops": 3.3113 + "tflops": 2.9641 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 4.2113 + "tflops": 3.0299 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 8.2385 + "tflops": 6.0481 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 16.1452 + "tflops": 11.9612 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 21.0316 + "tflops": 14.7054 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 25.6797 + "tflops": 19.2536 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 25.1757 + "tflops": 17.9079 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 25.594 + "tflops": 17.9907 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 25.6878 + "tflops": 18.84 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 
25.7688 + "tflops": 23.5197 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 25.5703 + "tflops": 24.2473 } ] }, @@ -1663,67 +3391,67 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 0.8546 + "tflops": 0.8234 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.638 + "tflops": 1.5544 }, { "batch_size": 4, "kernel": "wvsplitk_int4", - "tflops": 3.1231 + "tflops": 2.9885 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 4.132 + "tflops": 3.3552 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 8.023 + "tflops": 6.6424 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 15.8907 + "tflops": 13.3038 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 14.5537 + "tflops": 16.4524 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 23.7896 + "tflops": 22.1102 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 25.3109 + "tflops": 20.2235 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 25.5795 + "tflops": 19.5842 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 25.4976 + "tflops": 20.5878 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 25.4903 + "tflops": 24.8453 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 25.5424 + "tflops": 25.5954 } ] } @@ -1731,9 +3459,9 @@ }, { "in_features": 3584, - "out_features": 3584, + "out_features": 37888, "group_size": 128, - "comment": "Qwen2.5-7B o_proj", + "comment": "Qwen2.5-7B gate_up_proj", "providers": [ { "provider": "hybrid-w4a16", @@ -1741,67 +3469,67 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 0.9835 + "tflops": 0.9246 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.8347 + "tflops": 1.7643 }, { "batch_size": 4, "kernel": "wvsplitk_int4", - "tflops": 3.5694 + "tflops": 3.4354 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 4.1539 + "tflops": 5.9348 }, { 
"batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 7.8149 + "tflops": 11.6618 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 15.8862 + "tflops": 19.3069 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 16.8782 + "tflops": 23.1961 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 18.0556 + "tflops": 26.6591 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 22.0844 + "tflops": 28.6531 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 20.633 + "tflops": 31.7601 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 23.0266 + "tflops": 31.4982 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 24.5028 + "tflops": 31.6592 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 24.4662 + "tflops": 31.9288 } ] }, @@ -1811,67 +3539,67 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 0.9292 + "tflops": 0.9225 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.7102 + "tflops": 1.705 }, { "batch_size": 4, "kernel": "wvsplitk_int4", - "tflops": 3.3607 + "tflops": 3.4263 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 4.1325 + "tflops": 5.319 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 7.9536 + "tflops": 10.6027 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 15.5861 + "tflops": 19.0655 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 16.4198 + "tflops": 23.0875 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 17.5615 + "tflops": 24.139 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 22.0996 + "tflops": 27.7383 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 20.7903 + "tflops": 30.9645 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 22.6602 + "tflops": 30.8207 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 24.2616 
+ "tflops": 31.3725 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 24.5029 + "tflops": 31.5699 } ] }, @@ -1881,67 +3609,67 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 0.9774 + "tflops": 0.941 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.802 + "tflops": 1.757 }, { "batch_size": 4, "kernel": "wvsplitk_int4", - "tflops": 3.5337 + "tflops": 3.3588 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 4.0088 + "tflops": 4.1537 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 7.9151 + "tflops": 8.3801 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 15.4545 + "tflops": 16.2178 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 18.795 + "tflops": 18.8702 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 18.476 + "tflops": 25.2072 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 23.5884 + "tflops": 26.3965 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 19.2127 + "tflops": 26.4223 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 23.6789 + "tflops": 26.4051 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 25.5239 + "tflops": 26.1312 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 25.7562 + "tflops": 25.7068 } ] }, @@ -1951,77 +3679,77 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 0.9331 + "tflops": 0.8812 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.7483 + "tflops": 1.6457 }, { "batch_size": 4, "kernel": "wvsplitk_int4", - "tflops": 3.325 + "tflops": 3.2888 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 3.9838 + "tflops": 4.1941 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 7.8087 + "tflops": 8.5888 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 15.214 + "tflops": 16.6511 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - 
"tflops": 18.6939 + "tflops": 14.672 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 17.975 + "tflops": 24.4108 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 22.3571 + "tflops": 25.6064 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 18.1997 + "tflops": 25.8391 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 22.0545 + "tflops": 25.947 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 25.4933 + "tflops": 26.2078 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 25.5425 + "tflops": 25.8985 } ] } ] }, { - "in_features": 3584, - "out_features": 4608, + "in_features": 4096, + "out_features": 4096, "group_size": 128, - "comment": "Qwen2.5-7B qkv_proj", + "comment": "Qwen3-8B o_proj", "providers": [ { "provider": "hybrid-w4a16", @@ -2029,67 +3757,67 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 0.9585 + "tflops": 0.826 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.7914 + "tflops": 1.5806 }, { "batch_size": 4, "kernel": "wvsplitk_int4", - "tflops": 3.5332 + "tflops": 3.2483 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 4.1761 + "tflops": 5.0043 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 7.4575 + "tflops": 10.0672 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 16.2209 + "tflops": 14.7996 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 18.5579 + "tflops": 19.5855 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 22.0935 + "tflops": 22.589 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 21.567 + "tflops": 27.8125 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 22.2707 + "tflops": 26.882 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 22.8004 + "tflops": 29.3288 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 24.4321 + "tflops": 
30.0921 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 24.7165 + "tflops": 25.5844 } ] }, @@ -2099,217 +3827,217 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 0.9067 + "tflops": 0.8084 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.669 + "tflops": 1.5294 }, { "batch_size": 4, "kernel": "wvsplitk_int4", - "tflops": 3.355 + "tflops": 3.0087 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 3.8338 + "tflops": 4.9991 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 7.2186 + "tflops": 9.7792 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 14.2532 + "tflops": 14.7202 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 17.164 + "tflops": 19.6429 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 21.2933 + "tflops": 22.6179 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 20.9253 + "tflops": 27.1684 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 21.5604 + "tflops": 26.232 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 22.6155 + "tflops": 26.4359 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 24.303 + "tflops": 28.4418 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 24.6696 + "tflops": 23.9736 } ] }, { - "provider": "hybrid-w4a16-bf16", + "provider": "hybrid-w4a16-zp-bf16", "baselines": [ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 0.9427 + "tflops": 0.8071 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.7606 + "tflops": 1.5568 }, { "batch_size": 4, "kernel": "wvsplitk_int4", - "tflops": 3.4726 + "tflops": 2.9738 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 3.6749 + "tflops": 3.5024 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 7.3571 + "tflops": 6.9302 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 14.5447 + "tflops": 13.7318 }, { 
"batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 18.9263 + "tflops": 15.8748 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 23.929 + "tflops": 17.7086 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 23.2479 + "tflops": 20.5433 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 22.8656 + "tflops": 17.912 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 23.1451 + "tflops": 19.474 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 25.7038 + "tflops": 25.5985 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 25.6065 + "tflops": 19.1745 } ] }, { - "provider": "hybrid-w4a16-zp-bf16", + "provider": "hybrid-w4a16-bf16", "baselines": [ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 0.9113 + "tflops": 0.8236 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.7064 + "tflops": 1.5975 }, { "batch_size": 4, "kernel": "wvsplitk_int4", - "tflops": 3.307 + "tflops": 3.2368 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 3.6973 + "tflops": 3.5562 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 7.3129 + "tflops": 6.9508 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 14.4331 + "tflops": 13.4244 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 19.4637 + "tflops": 15.7733 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 22.2201 + "tflops": 17.4596 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 21.6125 + "tflops": 21.0493 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 21.4651 + "tflops": 20.4634 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 21.627 + "tflops": 20.5142 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 25.4997 + "tflops": 25.0307 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 25.0739 + "tflops": 19.0773 } ] } ] 
}, { - "in_features": 3584, - "out_features": 37888, + "in_features": 4096, + "out_features": 6144, "group_size": 128, - "comment": "Qwen2.5-7B gate_up_proj", + "comment": "Qwen3-8B qkv_proj", "providers": [ { "provider": "hybrid-w4a16", @@ -2317,67 +4045,67 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 0.9485 + "tflops": 0.8631 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.7864 + "tflops": 1.6475 }, { "batch_size": 4, "kernel": "wvsplitk_int4", - "tflops": 3.4789 + "tflops": 3.3719 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 4.495 + "tflops": 5.4527 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 8.715 + "tflops": 10.8605 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 16.5926 + "tflops": 17.7081 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 22.2507 + "tflops": 23.2434 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 23.5897 + "tflops": 27.0153 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 24.9993 + "tflops": 31.0421 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 25.0296 + "tflops": 28.4302 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 25.158 + "tflops": 30.479 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 25.1095 + "tflops": 31.6088 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 24.6831 + "tflops": 27.2999 } ] }, @@ -2387,67 +4115,67 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 0.9442 + "tflops": 0.8397 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.7205 + "tflops": 1.5927 }, { "batch_size": 4, "kernel": "wvsplitk_int4", - "tflops": 3.4637 + "tflops": 3.2676 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 4.3038 + "tflops": 5.2391 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 8.2927 + "tflops": 10.4482 }, { "batch_size": 32, "kernel": 
"hybrid_triton_w4a16", - "tflops": 16.0686 + "tflops": 16.8095 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 14.764 + "tflops": 22.302 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 23.2801 + "tflops": 25.7006 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 24.3827 + "tflops": 29.7518 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 24.6459 + "tflops": 26.5371 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 24.7479 + "tflops": 30.1186 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 24.7876 + "tflops": 29.7291 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 24.2924 + "tflops": 24.3903 } ] }, @@ -2457,67 +4185,67 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 0.9278 + "tflops": 0.8589 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.7707 + "tflops": 1.6619 }, { "batch_size": 4, "kernel": "wvsplitk_int4", - "tflops": 3.4841 + "tflops": 3.3578 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 4.2226 + "tflops": 3.9212 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 8.2607 + "tflops": 7.7321 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 15.9568 + "tflops": 15.1735 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 21.5344 + "tflops": 19.3409 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 26.0977 + "tflops": 18.0147 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 25.8018 + "tflops": 21.7658 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 25.8168 + "tflops": 21.3366 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 25.9784 + "tflops": 21.1293 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 26.19 + "tflops": 25.5839 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 25.7026 + "tflops": 18.059 } ] }, 
@@ -2527,67 +4255,67 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 0.922 + "tflops": 0.817 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.7209 + "tflops": 1.6272 }, { "batch_size": 4, "kernel": "wvsplitk_int4", - "tflops": 3.375 + "tflops": 3.2128 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 3.9634 + "tflops": 3.9135 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 7.8804 + "tflops": 7.7407 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 15.5574 + "tflops": 15.2132 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 14.6318 + "tflops": 19.0793 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 23.7604 + "tflops": 18.3983 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 25.6745 + "tflops": 21.1264 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 25.4901 + "tflops": 20.5718 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 25.5034 + "tflops": 20.5937 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 23.9144 + "tflops": 25.5724 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 25.5529 + "tflops": 18.1877 } ] } @@ -2595,9 +4323,9 @@ }, { "in_features": 4096, - "out_features": 4096, + "out_features": 24576, "group_size": 128, - "comment": "Qwen3-8B o_proj", + "comment": "Qwen3-8B gate_up_proj", "providers": [ { "provider": "hybrid-w4a16", @@ -2605,67 +4333,67 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 0.8801 + "tflops": 0.9057 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.5812 + "tflops": 1.7473 }, { "batch_size": 4, "kernel": "wvsplitk_int4", - "tflops": 2.9409 + "tflops": 3.5206 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 2.5406 + "tflops": 5.6267 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 5.0901 + "tflops": 11.1825 }, { "batch_size": 32, "kernel": 
"hybrid_triton_w4a16", - "tflops": 9.9565 + "tflops": 18.9352 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 5.3127 + "tflops": 23.9465 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 18.489 + "tflops": 26.9945 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 7.6342 + "tflops": 28.9917 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 10.0903 + "tflops": 31.6904 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 16.7122 + "tflops": 32.3996 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 24.6697 + "tflops": 32.1455 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 18.1801 + "tflops": 31.0965 } ] }, @@ -2675,207 +4403,207 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 0.8389 + "tflops": 0.893 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.5446 + "tflops": 1.6678 }, { "batch_size": 4, "kernel": "wvsplitk_int4", - "tflops": 2.6576 + "tflops": 3.4196 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 2.4447 + "tflops": 4.6447 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 5.3615 + "tflops": 9.3708 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 9.263 + "tflops": 18.5302 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 4.8619 + "tflops": 23.4372 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 17.686 + "tflops": 23.2885 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 6.6291 + "tflops": 26.889 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 9.03 + "tflops": 30.5017 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 15.7897 + "tflops": 30.6714 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 24.4615 + "tflops": 31.0875 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 18.2463 + "tflops": 28.0848 } ] }, { - 
"provider": "hybrid-w4a16-zp-bf16", + "provider": "hybrid-w4a16-bf16", "baselines": [ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 0.8441 + "tflops": 0.887 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.5661 + "tflops": 1.6886 }, { "batch_size": 4, "kernel": "wvsplitk_int4", - "tflops": 2.6097 + "tflops": 3.3612 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 2.2277 + "tflops": 4.2315 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 4.4826 + "tflops": 8.3187 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 8.8083 + "tflops": 16.0906 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 4.9037 + "tflops": 17.6867 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 17.8947 + "tflops": 18.7925 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 6.5683 + "tflops": 26.1562 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 9.0265 + "tflops": 26.3951 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 15.7467 + "tflops": 26.2272 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 25.01 + "tflops": 26.234 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 18.4844 + "tflops": 22.4411 } ] }, { - "provider": "hybrid-w4a16-bf16", + "provider": "hybrid-w4a16-zp-bf16", "baselines": [ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 0.8726 + "tflops": 0.8855 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.62 + "tflops": 1.7071 }, { "batch_size": 4, "kernel": "wvsplitk_int4", - "tflops": 2.9255 + "tflops": 3.3326 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 2.4974 + "tflops": 4.2862 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 4.9684 + "tflops": 8.4451 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 9.5407 + "tflops": 16.4941 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - 
"tflops": 5.3852 + "tflops": 14.3988 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 17.9886 + "tflops": 19.3154 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 7.5107 + "tflops": 25.9021 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 10.2528 + "tflops": 26.1148 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 17.0682 + "tflops": 26.0562 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 25.1099 + "tflops": 26.0502 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 18.4845 + "tflops": 22.2817 } ] } @@ -2883,9 +4611,9 @@ }, { "in_features": 4096, - "out_features": 6144, + "out_features": 2048, "group_size": 128, - "comment": "Qwen3-8B qkv_proj", + "comment": "Qwen3.5-35B-A3B GDN out_proj", "providers": [ { "provider": "hybrid-w4a16", @@ -2893,67 +4621,67 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 0.876 + "tflops": 0.7586 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.6678 + "tflops": 1.4318 }, { "batch_size": 4, "kernel": "wvsplitk_int4", - "tflops": 2.7312 + "tflops": 1.9255 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 2.1969 + "tflops": 4.0737 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 4.3383 + "tflops": 8.0824 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 8.4491 + "tflops": 12.5709 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 4.7209 + "tflops": 17.2694 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 20.1226 + "tflops": 20.9973 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 5.972 + "tflops": 23.8807 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 11.0402 + "tflops": 26.9146 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 19.3295 + "tflops": 26.403 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 24.4893 + "tflops": 
28.7191 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 17.3335 + "tflops": 27.2514 } ] }, @@ -2963,67 +4691,67 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 0.8636 + "tflops": 0.7205 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.5756 + "tflops": 1.3476 }, { "batch_size": 4, "kernel": "wvsplitk_int4", - "tflops": 2.6067 + "tflops": 1.89 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 2.2277 + "tflops": 3.9154 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 4.341 + "tflops": 7.8952 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 8.1686 + "tflops": 12.3226 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 4.5626 + "tflops": 16.85 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 19.2137 + "tflops": 19.9671 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 5.2924 + "tflops": 22.2603 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 9.8484 + "tflops": 26.2996 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 18.0353 + "tflops": 26.0514 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 24.6201 + "tflops": 28.0285 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 17.3055 + "tflops": 26.2405 } ] }, @@ -3033,67 +4761,67 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 0.8996 + "tflops": 0.754 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.6717 + "tflops": 1.4205 }, { "batch_size": 4, "kernel": "wvsplitk_int4", - "tflops": 2.7599 + "tflops": 1.9021 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 2.2078 + "tflops": 2.8105 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 4.4118 + "tflops": 5.6154 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 8.2191 + "tflops": 11.1128 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 4.7325 + 
"tflops": 11.5423 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 19.5135 + "tflops": 17.4901 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 5.9658 + "tflops": 22.2625 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 11.2653 + "tflops": 22.1465 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 19.4455 + "tflops": 24.3898 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 24.9241 + "tflops": 16.6233 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 17.6045 + "tflops": 15.8406 } ] }, @@ -3103,77 +4831,77 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 0.8511 + "tflops": 0.7401 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.6016 + "tflops": 1.3832 }, { "batch_size": 4, "kernel": "wvsplitk_int4", - "tflops": 2.567 + "tflops": 1.8632 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 2.0103 + "tflops": 2.7404 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 4.0027 + "tflops": 5.4791 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 7.7614 + "tflops": 10.6259 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 4.5797 + "tflops": 11.3822 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 19.4189 + "tflops": 17.4478 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 5.1948 + "tflops": 22.4379 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 10.0063 + "tflops": 22.0313 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 17.957 + "tflops": 24.5822 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 24.6564 + "tflops": 16.7232 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 17.3753 + "tflops": 16.7872 } ] } ] }, { - "in_features": 4096, - "out_features": 24576, + "in_features": 6144, + "out_features": 2048, "group_size": 128, - "comment": "Qwen3-8B 
gate_up_proj", + "comment": "Qwen3-1.7B down_proj", "providers": [ { "provider": "hybrid-w4a16", @@ -3181,67 +4909,67 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 0.9091 + "tflops": 0.7948 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.7382 + "tflops": 1.4507 }, { "batch_size": 4, "kernel": "wvsplitk_int4", - "tflops": 3.3076 + "tflops": 1.9841 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 2.6973 + "tflops": 2.602 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 4.954 + "tflops": 5.0857 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 9.139 + "tflops": 8.7797 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 4.0481 + "tflops": 14.873 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 19.8157 + "tflops": 18.6254 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 23.7086 + "tflops": 20.6027 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 25.3975 + "tflops": 24.9511 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 25.2058 + "tflops": 24.2417 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 25.2144 + "tflops": 25.8505 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 20.1374 + "tflops": 20.1372 } ] }, @@ -3251,67 +4979,67 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 0.8986 + "tflops": 0.7464 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.6747 + "tflops": 1.348 }, { "batch_size": 4, "kernel": "wvsplitk_int4", - "tflops": 3.2698 + "tflops": 1.9514 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 2.2405 + "tflops": 2.5263 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 4.3141 + "tflops": 5.0572 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 8.6709 + "tflops": 8.7844 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 3.2115 + "tflops": 14.7692 
}, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 18.8777 + "tflops": 17.537 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 23.1642 + "tflops": 20.0715 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 24.7568 + "tflops": 24.7554 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 25.1365 + "tflops": 24.4962 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 25.0005 + "tflops": 23.9409 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 20.0364 + "tflops": 17.959 } ] }, @@ -3321,67 +5049,67 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 0.9054 + "tflops": 0.7892 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.738 + "tflops": 1.4355 }, { "batch_size": 4, "kernel": "wvsplitk_int4", - "tflops": 3.3016 + "tflops": 1.9638 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 2.3695 + "tflops": 2.1855 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 4.5947 + "tflops": 4.3328 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 8.6985 + "tflops": 8.7341 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 4.328 + "tflops": 9.7736 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 19.5395 + "tflops": 17.711 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 23.745 + "tflops": 23.1354 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 25.4509 + "tflops": 22.1119 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 25.593 + "tflops": 24.3709 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 25.5 + "tflops": 15.8191 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 20.4605 + "tflops": 14.1458 } ] }, @@ -3391,77 +5119,77 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 0.8882 + "tflops": 0.7632 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 
1.6885 + "tflops": 1.3794 }, { "batch_size": 4, "kernel": "wvsplitk_int4", - "tflops": 3.1548 + "tflops": 1.919 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 2.2772 + "tflops": 2.1691 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 4.5175 + "tflops": 4.3106 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 8.5687 + "tflops": 8.4619 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 3.2218 + "tflops": 9.6742 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 19.7153 + "tflops": 17.7427 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 23.1478 + "tflops": 22.8138 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 25.1433 + "tflops": 21.3156 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 25.2589 + "tflops": 24.4483 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 25.4194 + "tflops": 15.7667 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 20.3847 + "tflops": 13.3809 } ] } ] }, { - "in_features": 4096, - "out_features": 2048, + "in_features": 8192, + "out_features": 512, "group_size": 128, - "comment": "Qwen3.5-35B-A3B GDN out_proj", + "comment": "L2 2MiB at", "providers": [ { "provider": "hybrid-w4a16", @@ -3469,67 +5197,67 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 0.9383 + "tflops": 0.638 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.6086 + "tflops": 1.1946 }, { "batch_size": 4, "kernel": "wvsplitk_int4", - "tflops": 1.9359 + "tflops": 1.6928 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 3.0483 + "tflops": 1.0986 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 5.989 + "tflops": 2.0572 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 11.9035 + "tflops": 5.2613 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 11.1658 + "tflops": 9.7825 }, { "batch_size": 128, 
"kernel": "hybrid_triton_w4a16", - "tflops": 17.5029 + "tflops": 12.1252 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 20.1903 + "tflops": 15.2706 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 18.6465 + "tflops": 20.0896 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 20.4598 + "tflops": 19.6045 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 19.8067 + "tflops": 24.8267 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 18.49 + "tflops": 13.3178 } ] }, @@ -3539,67 +5267,67 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 0.8659 + "tflops": 0.6155 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.5013 + "tflops": 1.1409 }, { "batch_size": 4, "kernel": "wvsplitk_int4", - "tflops": 1.9112 + "tflops": 1.6651 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 2.6511 + "tflops": 1.097 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 5.2954 + "tflops": 2.0254 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 10.1059 + "tflops": 5.2293 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 10.0962 + "tflops": 9.7364 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 16.4971 + "tflops": 12.2849 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 18.023 + "tflops": 15.3177 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 18.0744 + "tflops": 19.8491 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 20.0834 + "tflops": 19.1049 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 19.0754 + "tflops": 24.8754 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 17.7431 + "tflops": 12.2074 } ] }, @@ -3609,67 +5337,67 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 0.8974 + "tflops": 0.6319 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.5892 + 
"tflops": 1.1881 }, { "batch_size": 4, "kernel": "wvsplitk_int4", - "tflops": 1.9001 + "tflops": 1.6406 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 2.8712 + "tflops": 1.0635 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 5.6464 + "tflops": 2.1369 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 10.8051 + "tflops": 3.6915 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 11.2481 + "tflops": 3.8021 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 17.6082 + "tflops": 8.7086 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 20.6487 + "tflops": 14.6094 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 23.003 + "tflops": 19.607 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 24.9117 + "tflops": 23.588 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 21.4438 + "tflops": 14.0777 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 18.7492 + "tflops": 9.3432 } ] }, @@ -3679,77 +5407,77 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 0.8679 + "tflops": 0.6271 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.5197 + "tflops": 1.1629 }, { "batch_size": 4, "kernel": "wvsplitk_int4", - "tflops": 1.8586 + "tflops": 1.5839 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 2.6793 + "tflops": 1.0812 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 5.2758 + "tflops": 2.1177 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 10.1368 + "tflops": 3.7397 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 10.4729 + "tflops": 3.7626 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 16.4199 + "tflops": 8.975 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 18.4962 + "tflops": 14.204 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 20.469 + 
"tflops": 19.3907 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 23.6211 + "tflops": 23.607 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 20.9558 + "tflops": 14.7239 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 18.393 + "tflops": 7.6264 } ] } ] }, { - "in_features": 8192, + "in_features": 8320, "out_features": 512, "group_size": 128, - "comment": "L2 2MiB at", + "comment": "L2 2MiB above", "providers": [ { "provider": "hybrid-w4a16", @@ -3757,67 +5485,67 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 1.1493 + "tflops": 0.5812 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.583 + "tflops": 1.1268 }, { "batch_size": 4, - "kernel": "wvsplitk_int4", - "tflops": 1.7119 + "kernel": "hybrid_triton_w4a16", + "tflops": 0.4647 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 1.67 + "tflops": 0.9295 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 3.1469 + "tflops": 1.8886 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 5.4256 + "tflops": 5.1433 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 3.5442 + "tflops": 8.6918 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 9.6996 + "tflops": 12.025 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 16.0883 + "tflops": 17.6314 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 18.2808 + "tflops": 21.5646 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 21.8062 + "tflops": 20.9101 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 16.5493 + "tflops": 27.4709 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 14.2201 + "tflops": 27.711 } ] }, @@ -3827,67 +5555,67 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 0.9742 + "tflops": 0.5609 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.4265 + "tflops": 1.0802 }, { 
"batch_size": 4, - "kernel": "wvsplitk_int4", - "tflops": 1.679 + "kernel": "hybrid_triton_w4a16", + "tflops": 0.4343 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 1.5253 + "tflops": 0.8637 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 2.8178 + "tflops": 1.6931 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 5.0579 + "tflops": 4.4462 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 3.3119 + "tflops": 7.4687 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 9.3804 + "tflops": 11.7746 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 15.5742 + "tflops": 17.6047 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 18.3559 + "tflops": 21.5518 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 21.6903 + "tflops": 20.768 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 15.8783 + "tflops": 27.42 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 12.0947 + "tflops": 27.497 } ] }, @@ -3897,67 +5625,67 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 1.1474 + "tflops": 0.5847 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.5826 + "tflops": 1.1262 }, { "batch_size": 4, - "kernel": "wvsplitk_int4", - "tflops": 1.6762 + "kernel": "hybrid_triton_w4a16", + "tflops": 0.472 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 1.4824 + "tflops": 0.9314 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 2.8497 + "tflops": 1.8899 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 5.0031 + "tflops": 3.7253 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 3.6669 + "tflops": 3.9768 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 9.6239 + "tflops": 8.5834 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 18.3148 + "tflops": 13.9989 }, { "batch_size": 512, "kernel": 
"hybrid_triton_w4a16", - "tflops": 19.8355 + "tflops": 19.654 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 24.0519 + "tflops": 23.5619 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 16.7656 + "tflops": 16.3944 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 13.5411 + "tflops": 16.4452 } ] }, @@ -3967,77 +5695,77 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 0.9991 + "tflops": 0.5724 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.4644 + "tflops": 1.1064 }, { "batch_size": 4, - "kernel": "wvsplitk_int4", - "tflops": 1.6134 + "kernel": "hybrid_triton_w4a16", + "tflops": 0.4702 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 1.5056 + "tflops": 0.9301 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 2.9368 + "tflops": 1.8886 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 5.0178 + "tflops": 3.7346 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 3.4393 + "tflops": 3.8911 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 8.8621 + "tflops": 9.0607 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 16.4891 + "tflops": 13.6343 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 18.7655 + "tflops": 19.513 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 22.1401 + "tflops": 23.515 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 15.8708 + "tflops": 16.786 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 11.9382 + "tflops": 16.9304 } ] } ] }, { - "in_features": 8320, - "out_features": 512, + "in_features": 9728, + "out_features": 2560, "group_size": 128, - "comment": "L2 2MiB above", + "comment": "Qwen3-4B down_proj", "providers": [ { "provider": "hybrid-w4a16", @@ -4045,67 +5773,67 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 1.1777 + "tflops": 0.8585 }, { "batch_size": 
2, "kernel": "wvsplitk_int4", - "tflops": 1.6729 + "tflops": 1.6983 }, { "batch_size": 4, "kernel": "hybrid_triton_w4a16", - "tflops": 0.8472 + "tflops": 2.1137 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 1.685 + "tflops": 4.1637 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 3.3672 + "tflops": 8.5797 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 6.647 + "tflops": 14.077 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 4.5277 + "tflops": 20.6864 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 10.7157 + "tflops": 25.4797 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 16.0478 + "tflops": 28.6848 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 18.5271 + "tflops": 32.8924 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 23.0259 + "tflops": 31.5361 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 19.5149 + "tflops": 31.5616 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 19.8677 + "tflops": 31.4653 } ] }, @@ -4115,67 +5843,67 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 1.0332 + "tflops": 0.8562 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.5737 + "tflops": 1.6419 }, { "batch_size": 4, "kernel": "hybrid_triton_w4a16", - "tflops": 0.7516 + "tflops": 2.1106 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 1.4951 + "tflops": 4.1726 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 2.9602 + "tflops": 8.4269 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 5.8492 + "tflops": 13.8677 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 4.2091 + "tflops": 20.7142 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 10.8307 + "tflops": 24.7862 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 15.6779 + "tflops": 28.2042 }, { 
"batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 18.411 + "tflops": 32.5585 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 22.7803 + "tflops": 32.4908 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 19.3718 + "tflops": 32.1383 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 20.2113 + "tflops": 29.5514 } ] }, @@ -4185,67 +5913,67 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 1.1746 + "tflops": 0.8484 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.6628 + "tflops": 1.6919 }, { "batch_size": 4, "kernel": "hybrid_triton_w4a16", - "tflops": 0.758 + "tflops": 1.6679 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 1.5047 + "tflops": 3.209 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 2.9989 + "tflops": 6.4124 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 5.9566 + "tflops": 12.8949 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 4.5895 + "tflops": 14.8644 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 10.441 + "tflops": 22.2763 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 18.8975 + "tflops": 24.5141 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 20.5872 + "tflops": 24.0315 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 24.6901 + "tflops": 24.8682 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 20.2411 + "tflops": 19.187 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 20.9146 + "tflops": 19.2818 } ] }, @@ -4255,77 +5983,77 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 1.0683 + "tflops": 0.8478 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.5969 + "tflops": 1.6862 }, { "batch_size": 4, "kernel": "hybrid_triton_w4a16", - "tflops": 0.7481 + "tflops": 1.6951 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 
1.4903 + "tflops": 3.3509 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 2.9715 + "tflops": 6.5163 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 5.8624 + "tflops": 12.7566 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 4.4103 + "tflops": 14.5913 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 9.9836 + "tflops": 22.3099 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 16.5573 + "tflops": 25.0978 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 18.7919 + "tflops": 24.1573 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 23.0312 + "tflops": 24.8718 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 19.9999 + "tflops": 20.1576 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 20.902 + "tflops": 20.0642 } ] } ] }, { - "in_features": 9728, + "in_features": 10240, "out_features": 2560, "group_size": 128, - "comment": "Qwen3-4B down_proj", + "comment": "Gemma3-4B down_proj", "providers": [ { "provider": "hybrid-w4a16", @@ -4333,67 +6061,67 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 0.9361 + "tflops": 0.8683 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.8381 + "tflops": 1.63 }, { "batch_size": 4, "kernel": "hybrid_triton_w4a16", - "tflops": 2.1756 + "tflops": 1.4133 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 4.1222 + "tflops": 2.8213 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 8.3762 + "tflops": 5.4544 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 15.3302 + "tflops": 9.9464 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 15.7077 + "tflops": 16.2799 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 21.8582 + "tflops": 16.6978 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 24.4545 + "tflops": 21.6647 }, { "batch_size": 512, 
"kernel": "hybrid_triton_w4a16", - "tflops": 24.053 + "tflops": 25.2363 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 24.4355 + "tflops": 27.9558 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 23.6322 + "tflops": 24.7802 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 23.6437 + "tflops": 20.1959 } ] }, @@ -4403,67 +6131,67 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 0.9072 + "tflops": 0.8474 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.7243 + "tflops": 1.4367 }, { "batch_size": 4, "kernel": "hybrid_triton_w4a16", - "tflops": 2.1943 + "tflops": 1.368 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 4.2395 + "tflops": 2.7599 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 8.208 + "tflops": 5.4085 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 16.0093 + "tflops": 9.8307 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 15.4266 + "tflops": 15.8017 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 23.9891 + "tflops": 14.9904 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 23.7865 + "tflops": 18.336 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 23.2798 + "tflops": 22.707 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 24.1159 + "tflops": 28.0761 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 22.8202 + "tflops": 20.1682 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 22.0967 + "tflops": 15.3161 } ] }, @@ -4473,67 +6201,67 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 0.9103 + "tflops": 0.8809 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.8069 + "tflops": 1.609 }, { "batch_size": 4, "kernel": "hybrid_triton_w4a16", - "tflops": 2.1057 + "tflops": 1.252 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 4.1229 + "tflops": 
2.4915 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 8.0928 + "tflops": 4.957 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 15.8411 + "tflops": 9.7702 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 16.668 + "tflops": 11.3783 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 23.4208 + "tflops": 20.8748 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 26.2979 + "tflops": 24.1608 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 26.768 + "tflops": 23.9023 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 26.3123 + "tflops": 24.639 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 23.9842 + "tflops": 16.1427 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 24.0119 + "tflops": 14.4625 } ] }, @@ -4543,67 +6271,67 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 0.8914 + "tflops": 0.8523 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.7599 + "tflops": 1.5014 }, { "batch_size": 4, "kernel": "hybrid_triton_w4a16", - "tflops": 1.9605 + "tflops": 1.213 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 3.7542 + "tflops": 2.4481 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 7.7122 + "tflops": 4.8248 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 15.0475 + "tflops": 9.4892 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 16.5715 + "tflops": 10.969 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 22.5244 + "tflops": 20.5487 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 23.6942 + "tflops": 21.6901 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 24.1609 + "tflops": 23.2895 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 23.9898 + "tflops": 24.5578 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - 
"tflops": 23.1156 + "tflops": 16.9695 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 22.9039 + "tflops": 15.2497 } ] } @@ -4621,67 +6349,67 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 0.8492 + "tflops": 0.8432 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.6561 + "tflops": 1.6461 }, { "batch_size": 4, "kernel": "hybrid_triton_w4a16", - "tflops": 1.7992 + "tflops": 2.5857 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 3.6172 + "tflops": 5.1519 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 7.2098 + "tflops": 10.3237 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 13.7937 + "tflops": 16.4595 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 9.163 + "tflops": 22.5669 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 18.7062 + "tflops": 24.9405 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 17.657 + "tflops": 27.8215 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 22.0426 + "tflops": 26.1259 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 24.3151 + "tflops": 28.1282 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 16.9969 + "tflops": 23.3631 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 16.6227 + "tflops": 19.9182 } ] }, @@ -4691,67 +6419,67 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 0.8005 + "tflops": 0.8157 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.5564 + "tflops": 1.5775 }, { "batch_size": 4, "kernel": "hybrid_triton_w4a16", - "tflops": 1.6645 + "tflops": 2.5194 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 3.2999 + "tflops": 5.0583 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 6.5801 + "tflops": 10.0672 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 12.6058 + "tflops": 16.0018 }, { "batch_size": 64, 
"kernel": "hybrid_triton_w4a16", - "tflops": 6.9698 + "tflops": 22.3521 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 17.945 + "tflops": 24.2307 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 15.6539 + "tflops": 27.2033 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 21.0149 + "tflops": 27.8041 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 23.8531 + "tflops": 26.3891 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 15.6563 + "tflops": 18.0602 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 15.1108 + "tflops": 16.4731 } ] }, @@ -4761,67 +6489,67 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 0.8386 + "tflops": 0.8379 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.656 + "tflops": 1.6419 }, { "batch_size": 4, "kernel": "hybrid_triton_w4a16", - "tflops": 1.7196 + "tflops": 1.9477 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 3.4173 + "tflops": 3.8561 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 6.7982 + "tflops": 7.5961 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 13.1231 + "tflops": 14.8745 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 9.8564 + "tflops": 16.7807 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 17.8348 + "tflops": 17.2658 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 17.4486 + "tflops": 23.5003 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 22.8572 + "tflops": 24.93 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 25.0252 + "tflops": 25.0891 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 16.1874 + "tflops": 13.4727 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 16.1473 + "tflops": 13.1094 } ] }, @@ -4831,67 +6559,67 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 
0.7746 + "tflops": 0.8295 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.5219 + "tflops": 1.6123 }, { "batch_size": 4, "kernel": "hybrid_triton_w4a16", - "tflops": 1.497 + "tflops": 1.9554 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 3.1086 + "tflops": 3.8718 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 6.2707 + "tflops": 7.6381 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 12.0282 + "tflops": 14.8978 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 7.1846 + "tflops": 16.4301 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 18.2821 + "tflops": 18.1162 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 15.5474 + "tflops": 23.1662 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 20.3352 + "tflops": 25.2401 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 22.9198 + "tflops": 24.2129 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 14.949 + "tflops": 14.0512 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 14.6586 + "tflops": 14.2238 } ] } @@ -4909,67 +6637,67 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 0.6628 + "tflops": 0.7691 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.2668 + "tflops": 1.4641 }, { "batch_size": 4, "kernel": "hybrid_triton_w4a16", - "tflops": 1.7605 + "tflops": 1.8961 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 3.3027 + "tflops": 3.8103 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 6.4611 + "tflops": 7.8169 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 12.7037 + "tflops": 12.4923 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 12.2372 + "tflops": 17.8132 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 18.419 + "tflops": 22.1925 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - 
"tflops": 21.427 + "tflops": 24.4846 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 19.3701 + "tflops": 28.4128 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 20.9742 + "tflops": 23.7273 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 14.4975 + "tflops": 20.6854 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 15.57 + "tflops": 20.2078 } ] }, @@ -4979,67 +6707,67 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 0.6178 + "tflops": 0.7394 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.1842 + "tflops": 1.3852 }, { "batch_size": 4, "kernel": "hybrid_triton_w4a16", - "tflops": 1.6867 + "tflops": 1.8693 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 3.1813 + "tflops": 3.7208 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 6.2884 + "tflops": 7.5705 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 12.054 + "tflops": 12.2195 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 10.3469 + "tflops": 17.6957 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 16.7178 + "tflops": 19.439 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 20.6519 + "tflops": 24.3418 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 18.8697 + "tflops": 28.1145 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 20.7922 + "tflops": 22.772 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 13.2217 + "tflops": 20.4481 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 14.1922 + "tflops": 20.1312 } ] }, @@ -5049,67 +6777,67 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 0.664 + "tflops": 0.765 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.2698 + "tflops": 1.4618 }, { "batch_size": 4, "kernel": "hybrid_triton_w4a16", - "tflops": 1.6986 + "tflops": 1.4338 }, { "batch_size": 8, 
"kernel": "hybrid_triton_w4a16", - "tflops": 3.3525 + "tflops": 2.8637 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 6.5969 + "tflops": 5.696 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 12.6248 + "tflops": 11.0479 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 12.468 + "tflops": 12.126 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 18.9792 + "tflops": 18.5934 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 23.004 + "tflops": 23.7222 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 21.1417 + "tflops": 23.2143 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 22.0724 + "tflops": 23.4174 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 14.2558 + "tflops": 13.0275 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 14.977 + "tflops": 13.5484 } ] }, @@ -5119,67 +6847,67 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 0.6384 + "tflops": 0.755 }, { "batch_size": 2, "kernel": "wvsplitk_int4", - "tflops": 1.2171 + "tflops": 1.4297 }, { "batch_size": 4, "kernel": "hybrid_triton_w4a16", - "tflops": 1.5234 + "tflops": 1.4414 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 3.042 + "tflops": 2.8832 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 6.0245 + "tflops": 5.6721 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 11.6611 + "tflops": 11.1268 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 10.7429 + "tflops": 12.0112 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 16.9008 + "tflops": 18.7885 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 20.7589 + "tflops": 23.9202 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 18.9066 + "tflops": 23.5689 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 20.8939 + "tflops": 24.0416 
}, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 13.0903 + "tflops": 14.5579 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 13.9524 + "tflops": 16.3642 } ] } @@ -5197,67 +6925,67 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 0.885 + "tflops": 0.8714 }, { "batch_size": 2, "kernel": "hybrid_triton_w4a16", - "tflops": 1.1035 + "tflops": 1.2922 }, { "batch_size": 4, "kernel": "hybrid_triton_w4a16", - "tflops": 2.1331 + "tflops": 2.5672 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 4.1602 + "tflops": 5.0878 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 8.2714 + "tflops": 10.1708 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 15.8243 + "tflops": 16.24 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 17.1115 + "tflops": 22.8203 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 17.2223 + "tflops": 24.8567 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 21.7413 + "tflops": 30.2096 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 23.4659 + "tflops": 32.5576 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 23.6745 + "tflops": 32.8383 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 22.2756 + "tflops": 32.8233 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 22.8942 + "tflops": 32.0518 } ] }, @@ -5267,67 +6995,67 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 0.8619 + "tflops": 0.8538 }, { "batch_size": 2, "kernel": "hybrid_triton_w4a16", - "tflops": 1.0915 + "tflops": 1.2904 }, { "batch_size": 4, "kernel": "hybrid_triton_w4a16", - "tflops": 2.069 + "tflops": 2.5751 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 3.943 + "tflops": 5.0972 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 7.8982 + "tflops": 10.0951 }, { "batch_size": 32, "kernel": 
"hybrid_triton_w4a16", - "tflops": 15.3026 + "tflops": 16.2447 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 15.3981 + "tflops": 22.6096 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 16.8937 + "tflops": 23.2851 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 21.6004 + "tflops": 29.4782 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 22.5615 + "tflops": 24.7926 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 23.6987 + "tflops": 30.8928 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 21.6381 + "tflops": 30.8603 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 21.5786 + "tflops": 28.3234 } ] }, @@ -5337,67 +7065,67 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 0.8809 + "tflops": 0.8745 }, { "batch_size": 2, "kernel": "hybrid_triton_w4a16", - "tflops": 1.0253 + "tflops": 0.9612 }, { "batch_size": 4, "kernel": "hybrid_triton_w4a16", - "tflops": 2.0285 + "tflops": 1.9088 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 4.0189 + "tflops": 3.7461 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 7.9354 + "tflops": 7.4315 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 15.4095 + "tflops": 14.8085 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 18.0048 + "tflops": 16.7154 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 17.3915 + "tflops": 16.4115 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 22.9484 + "tflops": 22.2555 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 25.4824 + "tflops": 24.3563 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 25.1519 + "tflops": 24.7497 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 22.6301 + "tflops": 18.4907 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 23.1348 + 
"tflops": 18.5423 } ] }, @@ -5407,67 +7135,67 @@ { "batch_size": 1, "kernel": "wvsplitk_int4", - "tflops": 0.8576 + "tflops": 0.8561 }, { "batch_size": 2, "kernel": "hybrid_triton_w4a16", - "tflops": 0.9376 + "tflops": 0.9513 }, { "batch_size": 4, "kernel": "hybrid_triton_w4a16", - "tflops": 1.8925 + "tflops": 1.9253 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 3.7333 + "tflops": 3.8034 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 7.3705 + "tflops": 7.4644 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 14.2827 + "tflops": 14.836 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 16.4865 + "tflops": 16.6624 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 16.557 + "tflops": 16.4288 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 21.0001 + "tflops": 21.5939 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 22.8224 + "tflops": 24.1823 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 23.1666 + "tflops": 25.4351 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 22.1723 + "tflops": 19.3491 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 22.0013 + "tflops": 19.2391 } ] } @@ -5485,67 +7213,67 @@ { "batch_size": 1, "kernel": "hybrid_triton_w4a16", - "tflops": 0.424 + "tflops": 0.4042 }, { "batch_size": 2, "kernel": "hybrid_triton_w4a16", - "tflops": 0.8228 + "tflops": 0.8125 }, { "batch_size": 4, "kernel": "hybrid_triton_w4a16", - "tflops": 1.5955 + "tflops": 1.6117 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 3.1065 + "tflops": 3.2004 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 5.4339 + "tflops": 6.1635 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 9.1693 + "tflops": 8.9625 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 6.7792 + "tflops": 15.5008 }, { "batch_size": 128, "kernel": 
"hybrid_triton_w4a16", - "tflops": 17.9705 + "tflops": 19.7804 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 22.3488 + "tflops": 17.4921 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 20.9475 + "tflops": 24.1397 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 19.9299 + "tflops": 21.4855 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 15.6152 + "tflops": 21.3496 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 16.0634 + "tflops": 18.6901 } ] }, @@ -5555,67 +7283,67 @@ { "batch_size": 1, "kernel": "hybrid_triton_w4a16", - "tflops": 0.3508 + "tflops": 0.4112 }, { "batch_size": 2, "kernel": "hybrid_triton_w4a16", - "tflops": 0.6919 + "tflops": 0.8167 }, { "batch_size": 4, "kernel": "hybrid_triton_w4a16", - "tflops": 1.3579 + "tflops": 1.6275 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 2.568 + "tflops": 3.2131 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 4.5435 + "tflops": 6.0822 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 7.6452 + "tflops": 8.869 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 6.5037 + "tflops": 15.3506 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 17.3892 + "tflops": 19.8616 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 22.0346 + "tflops": 17.6276 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 20.5223 + "tflops": 24.1913 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 19.4163 + "tflops": 20.5437 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 13.7689 + "tflops": 21.0029 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 14.7043 + "tflops": 18.626 } ] }, @@ -5625,67 +7353,67 @@ { "batch_size": 1, "kernel": "hybrid_triton_w4a16", - "tflops": 0.3789 + "tflops": 0.3425 }, { "batch_size": 2, "kernel": "hybrid_triton_w4a16", - 
"tflops": 0.7366 + "tflops": 0.6843 }, { "batch_size": 4, "kernel": "hybrid_triton_w4a16", - "tflops": 1.4298 + "tflops": 1.3675 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 2.691 + "tflops": 2.7247 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 4.9467 + "tflops": 5.4019 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 8.4458 + "tflops": 9.6212 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 6.9205 + "tflops": 9.2575 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 18.3407 + "tflops": 17.9834 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 24.098 + "tflops": 23.2953 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 22.5525 + "tflops": 22.6332 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 19.3027 + "tflops": 22.8482 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 15.0436 + "tflops": 13.5009 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 15.5594 + "tflops": 13.7953 } ] }, @@ -5695,67 +7423,67 @@ { "batch_size": 1, "kernel": "hybrid_triton_w4a16", - "tflops": 0.3533 + "tflops": 0.3448 }, { "batch_size": 2, "kernel": "hybrid_triton_w4a16", - "tflops": 0.7015 + "tflops": 0.6894 }, { "batch_size": 4, "kernel": "hybrid_triton_w4a16", - "tflops": 1.4 + "tflops": 1.3774 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 2.7728 + "tflops": 2.7439 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 5.4103 + "tflops": 5.4167 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 9.9436 + "tflops": 9.6714 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 6.8438 + "tflops": 9.0974 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 17.0035 + "tflops": 17.9583 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 22.0357 + "tflops": 23.4448 }, { "batch_size": 512, "kernel": 
"hybrid_triton_w4a16", - "tflops": 20.4016 + "tflops": 22.222 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 19.316 + "tflops": 23.2227 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 13.6848 + "tflops": 14.5741 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 14.0292 + "tflops": 14.378 } ] } @@ -5773,67 +7501,67 @@ { "batch_size": 1, "kernel": "hybrid_triton_w4a16", - "tflops": 0.2254 + "tflops": 0.4983 }, { "batch_size": 2, "kernel": "hybrid_triton_w4a16", - "tflops": 0.4478 + "tflops": 1.0061 }, { "batch_size": 4, "kernel": "hybrid_triton_w4a16", - "tflops": 0.8837 + "tflops": 1.9957 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 1.7114 + "tflops": 3.8952 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 3.3617 + "tflops": 7.2592 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 6.0897 + "tflops": 10.286 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 5.2481 + "tflops": 16.1262 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 13.251 + "tflops": 22.1055 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 17.8496 + "tflops": 20.0295 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 17.0149 + "tflops": 26.1682 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 19.0741 + "tflops": 20.9875 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 13.2297 + "tflops": 19.0018 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 14.7638 + "tflops": 18.9288 } ] }, @@ -5843,67 +7571,67 @@ { "batch_size": 1, "kernel": "hybrid_triton_w4a16", - "tflops": 0.1845 + "tflops": 0.4984 }, { "batch_size": 2, "kernel": "hybrid_triton_w4a16", - "tflops": 0.3661 + "tflops": 0.9939 }, { "batch_size": 4, "kernel": "hybrid_triton_w4a16", - "tflops": 0.7269 + "tflops": 1.9877 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 
1.3789 + "tflops": 3.8584 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 2.6411 + "tflops": 7.133 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 4.6507 + "tflops": 9.8464 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 4.5358 + "tflops": 15.8798 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 12.7032 + "tflops": 20.1028 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 16.6265 + "tflops": 19.2948 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 15.7705 + "tflops": 25.3164 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 17.5845 + "tflops": 17.76 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 12.3266 + "tflops": 18.1115 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 13.3552 + "tflops": 18.3809 } ] }, @@ -5913,67 +7641,67 @@ { "batch_size": 1, "kernel": "hybrid_triton_w4a16", - "tflops": 0.2229 + "tflops": 0.3733 }, { "batch_size": 2, "kernel": "hybrid_triton_w4a16", - "tflops": 0.4398 + "tflops": 0.736 }, { "batch_size": 4, "kernel": "hybrid_triton_w4a16", - "tflops": 0.8398 + "tflops": 1.4725 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 1.5984 + "tflops": 2.9242 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 3.0584 + "tflops": 5.6453 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 5.2429 + "tflops": 9.9775 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 5.2641 + "tflops": 9.3509 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 12.7121 + "tflops": 17.9037 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 18.687 + "tflops": 23.5871 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 18.4856 + "tflops": 22.5694 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 18.8484 + "tflops": 21.6068 }, { "batch_size": 2048, "kernel": 
"hybrid_triton_w4a16", - "tflops": 12.7829 + "tflops": 12.0418 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 13.8779 + "tflops": 11.0855 } ] }, @@ -5983,67 +7711,67 @@ { "batch_size": 1, "kernel": "hybrid_triton_w4a16", - "tflops": 0.2218 + "tflops": 0.3787 }, { "batch_size": 2, "kernel": "hybrid_triton_w4a16", - "tflops": 0.4401 + "tflops": 0.7571 }, { "batch_size": 4, "kernel": "hybrid_triton_w4a16", - "tflops": 0.8749 + "tflops": 1.5092 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 1.727 + "tflops": 2.9793 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 3.3501 + "tflops": 5.755 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 6.1865 + "tflops": 10.0986 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 4.6114 + "tflops": 9.1947 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 12.5741 + "tflops": 18.0054 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 16.5046 + "tflops": 23.7634 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 15.7934 + "tflops": 22.8553 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 17.7758 + "tflops": 21.9523 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 11.586 + "tflops": 14.1278 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 12.6513 + "tflops": 15.7979 } ] } @@ -6061,67 +7789,67 @@ { "batch_size": 1, "kernel": "hybrid_triton_w4a16", - "tflops": 0.262 + "tflops": 0.654 }, { "batch_size": 2, "kernel": "hybrid_triton_w4a16", - "tflops": 0.5001 + "tflops": 1.2994 }, { "batch_size": 4, "kernel": "hybrid_triton_w4a16", - "tflops": 0.969 + "tflops": 2.5728 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 2.0626 + "tflops": 5.0223 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 4.0089 + "tflops": 9.8008 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 7.4095 + 
"tflops": 14.4447 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 6.0759 + "tflops": 21.0946 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 9.6046 + "tflops": 23.0806 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 11.9319 + "tflops": 23.0672 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 18.156 + "tflops": 21.4033 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 21.0747 + "tflops": 23.7258 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 14.6117 + "tflops": 20.6372 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 14.811 + "tflops": 18.5762 } ] }, @@ -6131,67 +7859,67 @@ { "batch_size": 1, "kernel": "hybrid_triton_w4a16", - "tflops": 0.1896 + "tflops": 0.6309 }, { "batch_size": 2, "kernel": "hybrid_triton_w4a16", - "tflops": 0.372 + "tflops": 1.2545 }, { "batch_size": 4, "kernel": "hybrid_triton_w4a16", - "tflops": 0.7141 + "tflops": 2.4986 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 1.4814 + "tflops": 4.9337 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 3.0375 + "tflops": 9.5053 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 5.8263 + "tflops": 13.945 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 5.305 + "tflops": 20.9683 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 10.1078 + "tflops": 18.6754 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 11.497 + "tflops": 23.7676 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 15.3731 + "tflops": 23.2501 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 19.4851 + "tflops": 18.5538 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 13.4057 + "tflops": 17.6012 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 12.9403 + "tflops": 16.0544 } ] }, @@ -6201,67 +7929,67 @@ { 
"batch_size": 1, "kernel": "hybrid_triton_w4a16", - "tflops": 0.246 + "tflops": 0.4908 }, { "batch_size": 2, "kernel": "hybrid_triton_w4a16", - "tflops": 0.4896 + "tflops": 0.9934 }, { "batch_size": 4, "kernel": "hybrid_triton_w4a16", - "tflops": 0.9348 + "tflops": 1.9817 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 1.9293 + "tflops": 3.914 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 3.877 + "tflops": 7.6158 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 7.2312 + "tflops": 14.5941 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 5.8312 + "tflops": 15.9098 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 14.6063 + "tflops": 16.582 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 12.3716 + "tflops": 23.1917 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 19.6078 + "tflops": 24.032 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 19.7853 + "tflops": 21.3052 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 14.0654 + "tflops": 12.7361 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 14.574 + "tflops": 13.3488 } ] }, @@ -6271,67 +7999,67 @@ { "batch_size": 1, "kernel": "hybrid_triton_w4a16", - "tflops": 0.2389 + "tflops": 0.499 }, { "batch_size": 2, "kernel": "hybrid_triton_w4a16", - "tflops": 0.464 + "tflops": 0.9953 }, { "batch_size": 4, "kernel": "hybrid_triton_w4a16", - "tflops": 0.9057 + "tflops": 1.9836 }, { "batch_size": 8, "kernel": "hybrid_triton_w4a16", - "tflops": 1.8822 + "tflops": 3.9327 }, { "batch_size": 16, "kernel": "hybrid_triton_w4a16", - "tflops": 3.6763 + "tflops": 7.7536 }, { "batch_size": 32, "kernel": "hybrid_triton_w4a16", - "tflops": 6.7128 + "tflops": 14.5516 }, { "batch_size": 64, "kernel": "hybrid_triton_w4a16", - "tflops": 5.0969 + "tflops": 14.7296 }, { "batch_size": 128, "kernel": "hybrid_triton_w4a16", - "tflops": 13.7095 + 
"tflops": 16.5329 }, { "batch_size": 256, "kernel": "hybrid_triton_w4a16", - "tflops": 10.2957 + "tflops": 23.4375 }, { "batch_size": 512, "kernel": "hybrid_triton_w4a16", - "tflops": 14.9646 + "tflops": 24.7607 }, { "batch_size": 1024, "kernel": "hybrid_triton_w4a16", - "tflops": 19.1935 + "tflops": 21.6051 }, { "batch_size": 2048, "kernel": "hybrid_triton_w4a16", - "tflops": 12.6953 + "tflops": 15.5971 }, { "batch_size": 4096, "kernel": "hybrid_triton_w4a16", - "tflops": 12.7153 + "tflops": 16.041 } ] } diff --git a/tests/kernels/quantization/test_hybrid_w4a16_perf.py b/tests/kernels/quantization/test_hybrid_w4a16_perf.py index ee3138572fc7..c147dadb8092 100644 --- a/tests/kernels/quantization/test_hybrid_w4a16_perf.py +++ b/tests/kernels/quantization/test_hybrid_w4a16_perf.py @@ -217,6 +217,43 @@ def _log_temp(config: Any, label: str) -> float: "group_size": 128, "comment": "L2 2MiB above", }, + { + "in_features": 2048, + "out_features": 4096, + "group_size": 128, + "comment": "Qwen3-1.7B qkv_proj", + }, + { + "in_features": 2048, + "out_features": 12288, + "group_size": 128, + "comment": "Qwen3-1.7B gate_up_proj", + }, + { + "in_features": 6144, + "out_features": 2048, + "group_size": 128, + "comment": "Qwen3-1.7B down_proj", + }, + # RedHatAI/gemma-3-4b-it-quantized.w4a16 (g=128) + { + "in_features": 2560, + "out_features": 4096, + "group_size": 128, + "comment": "Gemma3-4B qkv_proj", + }, + { + "in_features": 2560, + "out_features": 20480, + "group_size": 128, + "comment": "Gemma3-4B gate_up_proj", + }, + { + "in_features": 10240, + "out_features": 2560, + "group_size": 128, + "comment": "Gemma3-4B down_proj", + }, ] # Provider naming convention: "[-zp][-bf16]". Suffix -zp selects the @@ -337,12 +374,18 @@ def prepare_hybrid_weights( Scales/zp follow the activation *dtype* so the kernel exercises the same fp16 vs bf16 code path it takes in production. 
""" + from vllm.model_executor.kernels.linear.mixed_precision.hybrid_w4a16 import ( + pack_skinny_int4, + ) + num_groups = K // group_size - w_q_skinny_i32 = torch.randint( - 0, 2**31, (N, K // 8), dtype=torch.int32, device=device - ) - w_q_skinny = w_q_skinny_i32.view(torch.int8).contiguous() + # Build the skinny weights via the SAME production helper the layer uses at + # load time, so the benchmark exercises the real memory layout -- including + # the gfx1151 cliff row-pad -- with no duplicated stride logic here. Values + # are random (irrelevant to timing); only shape/stride/dtype matter. + unpacked = torch.randint(0, 16, (N, K), dtype=torch.int32, device=device) + w_q_skinny, w_q_skinny_i32 = pack_skinny_int4(unpacked) w_s_skinny = torch.randn(N, num_groups, dtype=dtype, device=device) * 0.01 w_zp = torch.randint(0, 16, (N, num_groups), dtype=torch.int32, device=device).to( dtype @@ -356,6 +399,34 @@ def prepare_hybrid_weights( } +def _compute_packed_scale_zp( + w_s: torch.Tensor, + w_zp: torch.Tensor | None, + dtype: torch.dtype, +) -> torch.Tensor | None: + """Pack the per-group packed scale/zp carrier into one fp32, matching the load-time + carrier built in ``HybridW4A16LinearKernel.process_weights_after_loading``. + + Built ONLY for asymmetric layers (``w_zp`` given); returns None for symmetric + (the kernel uses the constant -8 offset there, no carrier). Low 16 bits = + scale; high 16 bits are dtype-specific: + fp16: bias_eff = -(8*scale + (zp-8)*scale) — magic-constant fp16 FMA dequant. + bf16: zp_int = raw zp 0..15 — int-domain subtract (bit-identical to the + separate scale + zp loads). 
+ """ + if w_zp is None or dtype not in (torch.float16, torch.bfloat16): + return None + scale_u16 = w_s.view(torch.uint16).to(torch.int32) & 0xFFFF + if dtype == torch.float16: + w_s_f32 = w_s.to(torch.float32) + scaled_zp_f32 = (w_zp.to(torch.float32) - 8.0) * w_s_f32 + bias_eff = (-(8.0 * w_s_f32 + scaled_zp_f32)).to(dtype) + hi_u16 = bias_eff.contiguous().view(torch.uint16).to(torch.int32) & 0xFFFF + else: + hi_u16 = w_zp.to(torch.int32) & 0xFFFF + return ((hi_u16 << 16) | scale_u16).view(torch.float32).contiguous() + + # --------------------------------------------------------------------------- # Core measurement # --------------------------------------------------------------------------- @@ -379,6 +450,25 @@ def _cool_down(config: Any, test_id: str) -> None: _log_temp(config, f"{test_id}:post-sleep") +# Rotate the weight operand through this many MiB so a revisited buffer has been +# evicted from the gfx1151 32 MiB MALL -- yields cold-weight measurements (as in +# a real forward pass, where each weight is read once and evicted before its next +# use) instead of the hot-MALL numbers a single-buffer cudagraph would report. +_ROTATE_TARGET_BYTES = 48 << 20 +_ROTATE_MAX = 32 + + +def _clone_strided(t: torch.Tensor | None) -> torch.Tensor | None: + """Clone preserving exact size AND stride (the K%2048 cliff workaround uses + row-padded weight strides; a plain .clone() would compact them and change + the measured kernel). None passes through (symmetric path has no carrier).""" + if t is None: + return None + c = torch.empty_strided(t.size(), t.stride(), dtype=t.dtype, device=t.device) + c.copy_(t) + return c + + def measure_tflops( M: int, weights: dict[str, torch.Tensor], @@ -387,7 +477,13 @@ def measure_tflops( group_size: int, provider: str, ) -> tuple[str, float]: - """Run the kernel and return (kernel label, median TFLOP/s).""" + """Run the kernel and return (kernel label, median TFLOP/s). 
+ + The weight operand is rotated through enough copies to exceed the MALL so + each captured call reads cold weights; the activation stays single-buffered + (hot), matching a real prefill where activations are freshly produced and + weights stream from HBM. + """ from vllm.model_executor.kernels.linear.mixed_precision.hybrid_w4a16 import ( _hybrid_w4a16_apply_impl, ) @@ -400,17 +496,47 @@ def measure_tflops( cu_count = num_compute_units() use_zp = _provider_use_zp(provider) + # packing zero point and scales for faster access + packed_scale_zp = _compute_packed_scale_zp( + weights["w_s_skinny"], weights["w_zp"] if use_zp else None, dtype + ) + + # Build N rotating copies of the (cold) weight operands; the dominant read is + # w_q_skinny_i32 (N*K/2 bytes), so size the rotation off it. run() advances a + # buffer index per call; do_bench_cudagraph unrolls n_repeat = rep/est calls + # into the graph, so at small M (tiny est) n_repeat far exceeds n_buf and the + # rotation is complete -- exactly where cache residency matters. At large M + # the kernel is compute-bound (cache-insensitive), so partial rotation there + # is immaterial. 
+ w_bytes = ( + weights["w_q_skinny_i32"].numel() * weights["w_q_skinny_i32"].element_size() + ) + n_buf = max(2, min(_ROTATE_MAX, -(-_ROTATE_TARGET_BYTES // w_bytes))) + bufs = [ + { + "w_q_skinny": _clone_strided(weights["w_q_skinny"]), + "w_s_skinny": _clone_strided(weights["w_s_skinny"]), + "w_q_skinny_i32": _clone_strided(weights["w_q_skinny_i32"]), + "w_zp": _clone_strided(weights["w_zp"]) if use_zp else None, + "packed_scale_zp": _clone_strided(packed_scale_zp), + } + for _ in range(n_buf) + ] + idx = [0] def run(): + w = bufs[idx[0] % n_buf] + idx[0] += 1 return _hybrid_w4a16_apply_impl( a, - weights["w_q_skinny"], - weights["w_s_skinny"], - weights["w_q_skinny_i32"], - weights["w_zp"] if use_zp else None, + w["w_q_skinny"], + w["w_s_skinny"], + w["w_q_skinny_i32"], + w["w_zp"], None, # bias cu_count, group_size, + w["packed_scale_zp"], ) ms = triton.testing.do_bench_cudagraph(run, quantiles=[0.5]) diff --git a/tests/kernels/quantization/test_hybrid_w4a16_triton.py b/tests/kernels/quantization/test_hybrid_w4a16_triton.py index ffef412318c8..d6cbaa629c00 100644 --- a/tests/kernels/quantization/test_hybrid_w4a16_triton.py +++ b/tests/kernels/quantization/test_hybrid_w4a16_triton.py @@ -105,7 +105,6 @@ def test_triton_w4a16_skinny_fmt_gemm_matches_reference( b_q=b_packed, scales=scales, group_size=G, - zp_bias=8, ) ref = _w4a16_skinny_reference( a, @@ -146,6 +145,24 @@ def _w4a16_skinny_reference_asymmetric( return out.to(a_mk.dtype) +def _pack_scale_zp( + scales_nkg: torch.Tensor, zp_nkg: torch.Tensor, dtype: torch.dtype +) -> torch.Tensor: + """Build the asymmetric PackedSb carrier [N, K//G] fp32 that + triton_w4a16_skinny_fmt_gemm consumes (low 16 bits = scale; high 16 bits = + fp16 bias_eff = -(8 + (zp-8))*scale, or bf16 integer zp). Mirrors + HybridW4A16LinearKernel.process_weights_after_loading. 
+ """ + scale_u16 = scales_nkg.contiguous().view(torch.uint16).to(torch.int32) & 0xFFFF + if dtype == torch.float16: + s32 = scales_nkg.to(torch.float32) + bias_eff = (-(8.0 * s32 + (zp_nkg.to(torch.float32) - 8.0) * s32)).to(dtype) + hi_u16 = bias_eff.contiguous().view(torch.uint16).to(torch.int32) & 0xFFFF + else: + hi_u16 = zp_nkg.to(torch.int32) & 0xFFFF + return ((hi_u16 << 16) | scale_u16).view(torch.float32).contiguous() + + @pytest.mark.skipif(not current_platform.is_rocm(), reason="ROCm only") @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) @pytest.mark.parametrize( @@ -185,7 +202,7 @@ def test_triton_w4a16_skinny_fmt_gemm_asymmetric(dtype, M, K, N, G, random_seed: b_q=b_packed, scales=scales, group_size=G, - zp=zp, + packed_scale_zp=_pack_scale_zp(scales, zp, dtype), ) ref = _w4a16_skinny_reference_asymmetric( a, diff --git a/vllm/model_executor/kernels/linear/mixed_precision/hybrid_w4a16.py b/vllm/model_executor/kernels/linear/mixed_precision/hybrid_w4a16.py index 532ac1547ea0..ece176bf329d 100644 --- a/vllm/model_executor/kernels/linear/mixed_precision/hybrid_w4a16.py +++ b/vllm/model_executor/kernels/linear/mixed_precision/hybrid_w4a16.py @@ -44,13 +44,46 @@ # --------------------------------------------------------------------------- +@tl.target_info.constexpr_function +def _target_is_gfx1x() -> bool: + """Compile-time True on RDNA gfx11/gfx12 (where the v_and_or_b32 packed + dequant is validated/tuned).""" + target = tl.target_info.current_target() + if target is None or target.backend != "hip": + return False + arch = str(target.arch) + return arch.startswith("gfx11") or arch.startswith("gfx12") + + +@triton.jit +def _int4_pair_to_fp16x2(x): + """Unpack two packed int4 nibbles into a uint32 holding two fp16 lanes, + each equal to 1024 + nibble, with one ``v_and_or_b32`` + (``(x & 0x000F000F) | 0x64006400``). 
+ + OR-ing a 4-bit nibble into the low mantissa of fp16 1024.0 (0x6400) + bitcasts to exactly 1024+n (CK's i4_to_half trick). Doing it on a full + 32-bit lane dequants two nibbles per instruction, vs the scalar + v_and_b16 + v_or_b16 pair Triton emits from the elementwise form. + """ + mask = tl.full(x.shape, 0x000F000F, tl.int32) + return tl.inline_asm_elementwise( + asm="v_and_or_b32 $0, $1, $2, 0x64006400", + constraints="=v,v,v", + args=[x, mask], + dtype=tl.uint32, + is_pure=True, + pack=1, + ) + + @triton.jit def _triton_w4a16_skinny_fmt_kernel( # Pointers a_ptr, # [M, K] fp16/bf16 activations b_ptr, # [N, K//8] int32 packed (ExLlama shuffle, K is packed dim) - scales_ptr, # [N, K//G] fp16/bf16 scales (skinny layout) - zp_ptr, # [N, K//G] fp16/bf16 raw zero-points (when HAS_ZP=True) + scales_ptr, # [N, K//G] fp16/bf16 scales (sym path, HAS_ZP=False) + packed_scale_zp_ptr, # [N, K//G] int32 scale/zp carrier (asym, HAS_ZP) c_ptr, # [M, N] fp16/bf16 output # Dimensions M, @@ -59,27 +92,32 @@ def _triton_w4a16_skinny_fmt_kernel( K8, # K // 8 stride_bn, # per-row stride of b_ptr (in int32 elements) num_groups, # K // group_size - # Quantization parameters group_size, - ZP_BIAS: tl.constexpr, - HAS_ZP: tl.constexpr, + HAS_ZP: tl.constexpr, # asym: read the scale/zp carrier; sym: scales + (-8) # Block sizes BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr, ): """ - Fused W4A16 GEMM reading weights from skinny format [N, K//8]. + Fused W4A16 GEMM reading skinny weights [N, K//8]. B is stored as [N, K//8] int32 using ExLlama shuffle packing: each int32 packs 8 K-values with interleave [0,2,4,6,1,3,5,7]: packed = val[0] | (val[2]<<4) | (val[4]<<8) | (val[6]<<12) | (val[1]<<16) | (val[3]<<20) | (val[5]<<24) | (val[7]<<28) - Scales are [N, K//G] (skinny layout, NOT transposed). - When HAS_ZP=True, raw zero-points zp_raw are loaded from zp_ptr [N, K//G] - and subtracted directly: (nibble - zp_raw) * scale. 
- When HAS_ZP=False, only the constant ZP_BIAS is subtracted (symmetric). + Two dequant paths, chosen by the layer's sym/asym nature: + - HAS_ZP=True (asymmetric): read the carrier ``packed_scale_zp_ptr`` + [N, K//G] (one fp32 per (n, group)) — it folds the per-group scale AND + the zero-point offset into a single load, replacing the separate scale + + zp loads. Layout: fp16 = scale | bias_eff (= -8*scale - scaled_zp), + dequant (b_raw-1024)*scale + bias_eff, b_raw = 1024+nibble (magic unpack); + bf16 = scale | zp_int, dequant (nibble - zp_int)*scale. + - HAS_ZP=False (symmetric): the -8 offset is a constant, so there is + no second load to fold — read ``scales_ptr`` directly and subtract the + constant 8. fp16: (b_raw - 1032)*scale, b_raw = 1024+nibble; bf16: + (nibble - 8)*scale. (No carrier overhead for the sym fast path.) """ pid_m = tl.program_id(0) pid_n = tl.program_id(1) @@ -115,43 +153,85 @@ def _triton_w4a16_skinny_fmt_kernel( mask_b = (offs_n[:, None] < N) & (offs_k8[None, :] < K8) b_packed = tl.load(b_ptrs, mask=mask_b, other=0) - # ---- Unpack int4 weights with ExLlama unshuffle ---- - b = tl.interleave(b_packed, b_packed) - b = tl.interleave(b, b) - b = tl.interleave(b, b) - b = (b >> shifts_full) & 0xF # [BLOCK_N, BLOCK_K] + # ---- Unpack int4 weights ---- + # The packed v_and_or_b32 / v_pk_fma dequant is fp16-only (the 1024+n + # magic trick needs fp16's mantissa) and only validated/tuned on RDNA + # gfx11/gfx12. Decided here at compile time (dtype + target arch) so + # callers pass no flag; everything else uses the scalar unpack. The + # condition is written inline (not via a local) so Triton constexpr- + # eliminates the dead arm — the per-dtype vars below are only defined + # on the taken path. + if (a.dtype == tl.float16) and _target_is_gfx1x(): + # Packed dequant (fp16).
The ExLlama int32 holds the + # paired nibbles val[2p] @ bits[4p:4p+4] and val[2p+1] @ + # bits[16+4p:20+4p], so for pre-shift 4p (p=0..3), + # (x >> 4p) & 0x000F000F | 0x64006400 + # is one v_and_or_b32 producing a half2 = (1024+val[2p], + # 1024+val[2p+1]) in K order (signed shift is fine: the sign fill + # lands above bit 20, masked out). This dequants TWO nibbles per + # instruction; the elementwise form lowers to scalar v_and_b16 + + # v_or_b16 (1 nibble each). The interleave(lo, hi) lays b_raw out as + # half2 so the downstream affine also packs into v_pk_fma_f16. The + # dequant inner loop is VALU-issue-bound on gfx11, so this ~halves + # the dequant instruction count per WMMA and matches CK. + shifts4 = (tl.arange(0, 4) * 4)[None, None, :] + bp_shift = tl.reshape( + b_packed[:, :, None] >> shifts4, (BLOCK_N, BLOCK_K // 2) + ) + packed_hl = _int4_pair_to_fp16x2(bp_shift) # u32 half2: 1024+nibble pair + lo = (packed_hl & 0xFFFF).to(tl.uint16).to(tl.float16, bitcast=True) + hi = (packed_hl >> 16).to(tl.uint16).to(tl.float16, bitcast=True) + b_raw = tl.interleave(lo, hi) # [BLOCK_N, BLOCK_K] fp16 = 1024+nibble + else: + # ExLlama unshuffle: replicate each int32 8x then per-lane shift+mask. + b = tl.interleave(b_packed, b_packed) + b = tl.interleave(b, b) + b = tl.interleave(b, b) + b = (b >> shifts_full) & 0xF # [BLOCK_N, BLOCK_K] - # ---- Load scales from [N, K//G] layout ---- + # ---- Per-group quant params from [N, K//G] layout ---- g_idx = (k_start * BLOCK_K) // group_size - scale_ptrs = scales_ptr + offs_n * num_groups + g_idx scale_mask = offs_n < N - scales = tl.load(scale_ptrs, mask=scale_mask, other=1.0) # ---- Dequantize ---- if HAS_ZP: - zp_ptrs = zp_ptr + offs_n * num_groups + g_idx - zp_raw = tl.load(zp_ptrs, mask=scale_mask, other=0.0) - if scales.dtype == tl.bfloat16: - # bf16: subtract zp in INT (zp values are 0..15, exact - # roundtrip), then cast once. 
This mirrors the symmetric - # path and avoids the per-tile int->bf16 cast of the full - # [BLOCK_N, BLOCK_K] nibble block, which is the bottleneck - # for asymmetric w4a16 bf16 prefill on RDNA3.5 (Strix Halo, - # gfx1151). Casting zp_raw (only BLOCK_N elements) to int - # is cheap. Recovers ~14% TFLOPS across Qwen3-8B w4a16 - # prefill projections without changing fp16 behavior. - zp_int = zp_raw.to(b.dtype) - b_fp = (b - zp_int[:, None]).to(scales.dtype) * scales[:, None] + # Asymmetric: packed scale/zp carrier (one fp32/group folds scale + zp). + psz = tl.load( + packed_scale_zp_ptr + offs_n * num_groups + g_idx, + mask=scale_mask, + other=0, + ) + psz_u = psz.to(tl.uint32, bitcast=True) + if a.dtype == tl.float16: + # fp16: low16 = scale, high16 = bias_eff (= -8*scale - scaled_zp). + # ONE fp16 FMA per group via the magic-constant i4->fp16 unpack. + scale = (psz_u & 0xFFFF).to(tl.uint16).to(tl.float16, bitcast=True) + bias_eff = (psz_u >> 16).to(tl.uint16).to(tl.float16, bitcast=True) + if not _target_is_gfx1x(): + b_raw = (b | 0x6400).to(tl.uint16).to(tl.float16, bitcast=True) + c1024 = tl.full((), 1024.0, tl.float16) + b_fp = (b_raw - c1024) * scale[:, None] + bias_eff[:, None] else: - # fp16: original asymmetric path. The int->fp16 cast on - # RDNA3.5 has a direct ISA path and fuses well with the - # subsequent subtraction, so keeping the cast-first order - # avoids a small fp16 regression observed when switching - # both dtypes to the int-subtract-first form. - b_fp = (b.to(scales.dtype) - zp_raw[:, None]) * scales[:, None] + # bf16: low16 = scale, high16 = zp_int. Cheap int-domain subtract + # before the single bf16 multiply (RDNA3 has no v_pk_fma_bf16). 
+ scale = (psz_u & 0xFFFF).to(tl.uint16).to(tl.bfloat16, bitcast=True) + zp_int = ((psz_u >> 16) & 0xFFFF).to(b.dtype) + b_fp = (b - zp_int[:, None]).to(scale.dtype) * scale[:, None] else: - # Symmetric: (w - 8) * scale - b_fp = (b - ZP_BIAS).to(scales.dtype) * scales[:, None] + # Symmetric: the -8 offset is constant (no zp to fold), so read the + # scale directly — no carrier overhead. + scales = tl.load( + scales_ptr + offs_n * num_groups + g_idx, mask=scale_mask, other=1.0 + ) + if a.dtype == tl.float16: + # (nibble - 8) * scale == (b_raw - (1024+8)) * scale, via magic. + if not _target_is_gfx1x(): + b_raw = (b | 0x6400).to(tl.uint16).to(tl.float16, bitcast=True) + c = tl.full((), float(1024 + 8), tl.float16) + b_fp = (b_raw - c) * scales[:, None] + else: + # bf16: (nibble - 8) * scale, int subtract before the cast. + b_fp = (b - 8).to(scales.dtype) * scales[:, None] # ---- Transpose to [BLOCK_K, BLOCK_N] for matmul ---- b_fp_t = tl.trans(b_fp) @@ -166,26 +246,126 @@ def _triton_w4a16_skinny_fmt_kernel( tl.store(c_ptrs, c, mask=mask_c) +# Explicit gfx11 prefill tile selection -- DTYPE-AWARE. The kernel takes the +# packed v_and_or/v_pk_fma dequant for fp16 and the scalar dequant for bf16, and +# the two paths want different tiles (most visibly BLOCK_N at deep M: 256 for +# packed fp16 vs 64 for scalar bf16). Both were validated under do_bench_cudagraph +# with rotating cold weights over the F.2 catalog. +# +# fp16 (packed) -- tuned table (weights are row-padded as in production, so the +# K%4096 global-load cliff is already dodged and needs no special-casing): +# * M <= 16: BLOCK_M=16 (more M-tiles fill the 40 CUs at tiny M). +# * 17..64: BLOCK_M=32; small BLOCK_N keeps the grid large (a wide BLOCK_N +# leaves only ceil(N/BN) workgroups -- the M-blind BLOCK_N=256 was a 1.6-3x +# regression here). Square mid shapes take BLOCK_N=128/BK=64. +# * 65..256: square -> BLOCK_N=128 (BK=32; nw=8 at M>128); tall -> BLOCK_N=128.
+# * 257..2047: the wide distilled BLOCK_N=256/BLOCK_M=128 tile. +# * M >= 2048: distilled BLOCK_N=256; BLOCK_M=64 for narrow+deep K (N<=2048 and +# K>=4096), else 128. +13-50% over gfx11. +# +# bf16 (scalar) -- reuse the pre-packed-kernel ("gfx11") scalar-tuned table. The +# bf16 kernel body is byte-identical to that kernel, so this keeps bf16 at gfx11 +# parity (the packed fp16 table regresses bf16 by up to ~40% at deep M, where +# scalar bf16 wants BLOCK_N=64, not 256). +# +# BLOCK_K is capped to group_size so a K-block never straddles a quant group +# (scale aliasing); gs128 -- the bulk -- passes the table BLOCK_K through. +def _select_skinny_gfx11_config( + M: int, N: int, K: int, group_size: int, dtype: torch.dtype +) -> tuple[int, int, int, int]: + """Return (BLOCK_M, BLOCK_N, BLOCK_K, num_warps) for the gfx11 skinny GEMM.""" + if dtype == torch.float16: + # Packed-dequant path (fp16 on gfx1x). Cold-optimal tiers from a broad + # rotating-buffer cudagraph sweep over the catalog (weights row-padded + # exactly as production via pack_skinny_int4, so the K%4096 cliff is + # already dodged -- no cliff special-casing needed here). 
+ tall = K >= 2 * N # tall-K (down_proj-like) + if M <= 16: # BM=16: more M-tiles fill the CUs at tiny M + block_m, block_n, block_k, num_warps = 16, 64, 128, 4 + elif M <= 32: + block_m, block_n, block_k, num_warps = 32, 64, 128, 4 + elif M <= 64: + if tall or N >= 4 * K: # tall or very wide + block_m, block_n, block_k, num_warps = 32, 64, 128, 4 + else: # square mid + block_m, block_n, block_k, num_warps = 32, 128, 64, 4 + elif M <= 128: + if tall: + block_m, block_n, block_k, num_warps = 32, 128, 64, 4 + elif N >= 16384: # very wide N + block_m, block_n, block_k, num_warps = 128, 128, 32, 8 + else: # square + block_m, block_n, block_k, num_warps = 64, 128, 32, 4 + elif M <= 256: + block_m, block_n, block_k, num_warps = 128, 128, 32, 8 + elif M < 2048: # 257..2047 (mostly 512, 1024): wide distilled tile + block_m, block_n, block_k, num_warps = 128, 256, 32, 8 + else: # M >= 2048 (deep prefill) + if N <= 2048 and K >= 4096: # narrow + deep: halved BLOCK_M saturates + block_m, block_n, block_k, num_warps = 64, 256, 32, 8 + else: + block_m, block_n, block_k, num_warps = 128, 256, 32, 8 + # Very narrow N (e.g. L2 N=512 microbench shapes) at small/mid M: a wide + # BLOCK_N leaves too few N-tiles to fill the CUs, so clamp it. At M>=1024 + # the M-tiles already saturate, so the wide tile is kept. + if N <= 1024 and M <= 512: + block_n = min(block_n, 32) + else: + # Scalar-dequant path (bf16): pre-packed-kernel scalar-tuned table. 
+ if M <= 32: + block_m, block_n, block_k, num_warps = 32, 32, 128, 4 + elif M <= 64: + block_m, block_n, block_k, num_warps = 64, 64, 32, 4 + elif M <= 128: + if K >= 4096 and N >= 4096: + block_m, block_n, block_k, num_warps = 64, 32, 128, 4 + elif K >= 2 * N: # tall K (down_proj) + block_m, block_n, block_k, num_warps = 64, 16, 64, 1 + elif N > K: # wide N (qkv / gate_up) + block_m, block_n, block_k, num_warps = 64, 64, 64, 4 + else: # N ~= K (o_proj) + block_m, block_n, block_k, num_warps = 64, 32, 64, 4 + elif M <= 1024: + if K >= 2 * N: # tall K (down_proj) + block_m, block_n, block_k, num_warps = 64, 64, 64, 4 + elif N >= 4 * K: # very wide N (gate_up) + block_m, block_n, block_k, num_warps = 128, 64, 64, 8 + else: + block_m, block_n, block_k, num_warps = 64, 128, 32, 4 + else: # M > 1024 + if K >= 2 * N: # tall K (down_proj) + block_m, block_n, block_k, num_warps = 128, 512, 32, 16 + else: + block_m, block_n, block_k, num_warps = 128, 64, 64, 8 + return block_m, block_n, min(block_k, group_size), num_warps + + def triton_w4a16_skinny_fmt_gemm( a: torch.Tensor, # [M, K] fp16/bf16 b_q: torch.Tensor, # [N, K//8] int32 (ExLlama shuffle packed) - scales: torch.Tensor, # [N, K//G] fp16/bf16 + scales: torch.Tensor, # [N, K//G] fp16/bf16 (used for the symmetric path) group_size: int, - zp_bias: int = 8, - zp: torch.Tensor | None = None, # [N, K//G] per-group zero-points + out: torch.Tensor | None = None, # [M, N] optional pre-allocated output + packed_scale_zp: torch.Tensor | None = None, # [N, K//G] fp32 carrier (asym only) ) -> torch.Tensor: """ - Fused W4A16 GEMM reading from skinny weight format [N, K//8]. + Fused W4A16 GEMM reading skinny weights [N, K//8]. + + Asymmetric layers pass ``packed_scale_zp`` (the carrier that folds scale + + zero-point into one load); symmetric layers leave it None and the kernel + reads ``scales`` directly with a constant -8 offset (no carrier overhead — + sym has no second load to fold). 
Args: a: Activation matrix [M, K], float16 or bfloat16. b_q: Packed weight matrix [N, K//8], int32 (ExLlama shuffle). - scales: Per-group scales [N, K//G], same dtype as a. + scales: Per-group scales [N, K//G], same dtype as a (symmetric path). group_size: Quantization group size (resolved from -1 to K by caller). - zp_bias: Constant zero bias (default 8 for unsigned int4). - zp: Raw per-group zero-points [N, K//G] (asymmetric), - stored as zp_raw in activation dtype. When provided, - dequant is (nibble - zp_raw) * scale. + out: Optional pre-allocated [M, N] output. + packed_scale_zp: Optional packed scale/zp carrier [N, K//G] fp32 for asymmetric + layers; layout is dtype-specific (fp16: scale|bias_eff; bf16: + scale|zp_int) — see the kernel docstring. When None, the + symmetric path is used. Returns: Output matrix [M, N], same dtype as a. @@ -206,14 +386,58 @@ def triton_w4a16_skinny_fmt_gemm( assert scales.shape == (N, num_groups), ( f"scales shape mismatch: {scales.shape} vs ({N}, {num_groups})" ) - if zp is not None: - assert zp.is_contiguous(), "Zero-points must be contiguous" - assert zp.shape == (N, num_groups), ( - f"zp shape mismatch: {zp.shape} vs ({N}, {num_groups})" + has_zp = packed_scale_zp is not None + if packed_scale_zp is not None: + assert packed_scale_zp.is_contiguous(), "packed_scale_zp must be contiguous" + assert packed_scale_zp.shape == (N, num_groups), ( + f"packed_scale_zp shape mismatch: {packed_scale_zp.shape} " + f"vs ({N}, {num_groups})" ) - has_zp = zp is not None + packed_scale_zp_i32 = packed_scale_zp.view(torch.int32) + else: + packed_scale_zp_i32 = scales # dummy pointer (unused when HAS_ZP=False) - c = torch.empty((M, N), dtype=a.dtype, device=a.device) + if out is None: + c = torch.empty((M, N), dtype=a.dtype, device=a.device) + else: + assert out.shape == (M, N), f"out shape mismatch: {out.shape} vs ({M}, {N})" + assert out.dtype == a.dtype, f"out dtype mismatch: {out.dtype} vs {a.dtype}" + assert out.device == a.device, ( + 
f"out device mismatch: {out.device} vs {a.device}" + ) + assert out.is_contiguous(), "out must be contiguous" + c = out + + # On gfx11x, select the tile config per (M, N, K) from the per-M table + # (see _select_skinny_gfx11_config). BLOCK_K is capped to group_size there + # so a K-block never straddles a quant group (no scale aliasing). + if on_gfx1x(): + block_m, block_n, block_k, num_warps = _select_skinny_gfx11_config( + M, N, K, group_size, a.dtype + ) + grid = (triton.cdiv(M, block_m), triton.cdiv(N, block_n)) + # The kernel picks the packed (fp16/gfx1x) vs scalar unpack itself. + _triton_w4a16_skinny_fmt_kernel[grid]( + a, + b_q, + scales, + packed_scale_zp_i32, + c, + M, + N, + K, + K8, + stride_bn, + num_groups, + group_size=group_size, + HAS_ZP=has_zp, + BLOCK_M=block_m, + BLOCK_N=block_n, + BLOCK_K=block_k, + num_warps=num_warps, + num_stages=1, # >1 regresses badly for this kernel (no SW pipeline) + ) + return c # AMD-specific scheduling hint; only consumed by the HIP backend below # (see compiler.py amdgpu-waves-per-eu attribute). Set to 0 by default @@ -254,55 +478,6 @@ def triton_w4a16_skinny_fmt_gemm( BLOCK_M, BLOCK_N, BLOCK_K, num_warps = 256, 64, 64, 8 else: BLOCK_M, BLOCK_N, BLOCK_K, num_warps = 128, 128, 32, 8 - elif on_gfx1x(): - # Tuned on gfx1151 (Strix Halo, 40 CUs, 32-wide wavefronts) - # using Qwen3-4B weight shapes with group_size=128. - # waves_per_eu=0 means no constraint; specific values pin LLVM - # to a target VGPR budget per occupancy.md (gfx1151 has 1536 - # VGPRs/SIMD; waves_per_eu=N sets max VGPRs to ~1536/N). - if M <= 32: - BLOCK_M, BLOCK_N, BLOCK_K, num_warps = 32, 32, 128, 4 - elif M <= 64: - BLOCK_M, BLOCK_N, BLOCK_K, num_warps = 64, 64, 32, 4 - elif M <= 128: - # For K >= 4096 AND N >= 4096, a single config (BN=32, BK=128, - # NW=4) wins on every projection shape across Qwen3-8B and - # Llama-3.1-8B (down/qkv/gate_up/o_proj all gain +23%..+35% vs - # prior shape-specific configs). 
The wider K-tile escapes WMMA - # latency-bound regime (wmma.md: >= 2 waves/SIMD), and BN=32 - # keeps the workgroup grid large enough to saturate 40 CUs even - # at N up to ~28k. - # - # Small-N or small-K shapes (Qwen3-VL-4B / Qwen3-4B) need the - # legacy shape-specific configs — at N=2560 the BN=32 grid drops - # below the saturation point. - if K >= 4096 and N >= 4096: - BLOCK_M, BLOCK_N, BLOCK_K, num_warps = 64, 32, 128, 4 - # waves_per_eu=6 matches the natural VGPR-bound occupancy - # but explicitly pinning the target gives LLVM a single - # register count to optimize against (compiler.py: "forces - # LLVM to focus on a single register count, simplifies some - # heuristics and may improve scheduling"). +5-8% across all - # 4 K=N=4096 projection shapes. - waves_per_eu = 6 - elif K >= 2 * N: # tall K, small-N down (e.g. Qwen3-VL-4B down) - BLOCK_M, BLOCK_N, BLOCK_K, num_warps = 64, 16, 64, 1 - elif N > K: # wide N, small K (e.g. Qwen3-VL-4B qkv/gate_up) - BLOCK_M, BLOCK_N, BLOCK_K, num_warps = 64, 64, 64, 4 - else: # N ~= K, small K (e.g. Qwen3-VL-4B o_proj) - BLOCK_M, BLOCK_N, BLOCK_K, num_warps = 64, 32, 64, 4 - elif M <= 1024: - if K >= 2 * N: # tall K (e.g. down_proj) - BLOCK_M, BLOCK_N, BLOCK_K, num_warps = 64, 64, 64, 4 - elif N >= 4 * K: # very wide N (e.g. gate_up_proj) - BLOCK_M, BLOCK_N, BLOCK_K, num_warps = 128, 64, 64, 8 - else: - BLOCK_M, BLOCK_N, BLOCK_K, num_warps = 64, 128, 32, 4 - else: - if K >= 2 * N: # tall K (e.g. 
down_proj) - BLOCK_M, BLOCK_N, BLOCK_K, num_warps = 128, 512, 32, 16 - else: - BLOCK_M, BLOCK_N, BLOCK_K, num_warps = 128, 64, 64, 8 else: num_warps = 4 if M <= 32: @@ -323,7 +498,7 @@ def triton_w4a16_skinny_fmt_gemm( a, b_q, scales, - zp if has_zp else scales, # dummy pointer when no zp (unused) + packed_scale_zp_i32, c, M, N, @@ -332,7 +507,6 @@ def triton_w4a16_skinny_fmt_gemm( stride_bn, num_groups, group_size=group_size, - ZP_BIAS=zp_bias, HAS_ZP=has_zp, BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N, @@ -368,6 +542,38 @@ def pack_int4_exllama_shuffle(w_uint4: torch.Tensor) -> torch.Tensor: ) +def pack_skinny_int4(unpacked: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + """Pack [N, K] uint4 into the skinny weight layout the kernels consume. + + Single source of truth for the skinny weight memory layout: ExLlama shuffle + to [N, K//8] int32, then -- on gfx1151 only, when K_packed (= K/2 bytes) is a + multiple of 2048 -- pad each row by +32 int32 (+128 B). That pad makes the + GEMM read a non-power-of-2 row stride, dodging the multi-row global-load + cliff (channel/MALL hash collisions). Used by both + ``process_weights_after_loading`` and the perf benchmark so the benchmark can + never drift from the production stride. + + Returns ``(w_q_skinny, w_q_skinny_i32)``: an int8 view for the HIP skinny + kernel and the int32 view for the Triton kernel; both share stride(0). + """ + shuffled = pack_int4_exllama_shuffle(unpacked) + n_rows, k8 = shuffled.shape + k_packed_bytes = k8 * 4 # int32 -> bytes + pad_int32 = 32 # +128 B = one cache line, keeps each row cache-line aligned + if on_gfx1151() and k_packed_bytes % 2048 == 0 and shuffled.device.type == "cuda": + padded = torch.empty( + (n_rows, k8 + pad_int32), dtype=torch.int32, device=shuffled.device + ) + padded[:, :k8].copy_(shuffled) + # Both views inherit stride(0) = k8 + pad_int32. 
+ w_q_skinny_i32 = padded[:, :k8] + w_q_skinny = padded.view(torch.int8)[:, : k8 * 4] + else: + w_q_skinny_i32 = shuffled.contiguous() + w_q_skinny = w_q_skinny_i32.view(torch.int8) + return w_q_skinny, w_q_skinny_i32 + + # --------------------------------------------------------------------------- # Hybrid dispatch logic # --------------------------------------------------------------------------- @@ -382,6 +588,7 @@ def _hybrid_w4a16_apply_impl( bias: torch.Tensor | None, cu_count: int, group_size: int, + packed_scale_zp: torch.Tensor | None = None, ) -> torch.Tensor: """Dispatch between skinny GEMM and Triton based on batch size M. @@ -392,6 +599,8 @@ def _hybrid_w4a16_apply_impl( w_zp: [N, K//G] raw zero-points (zp_raw) in act dtype, or None for symmetric. Both HIP skinny and Triton use this single format: dequant = (nibble - zp_raw) * scale. + packed_scale_zp: [N, K//G] fp32 carrier packing scale + zero-point per + group (Triton prefill, asymmetric only), or None for symmetric. Registered as a custom op so torch.compile treats it as opaque. """ @@ -418,12 +627,15 @@ def _hybrid_w4a16_apply_impl( else torch.profiler.record_function(f"hybrid_triton_w4a16 {M}x{N}x{K}") ) with ctx: + # Asymmetric layers carry the packed scale/zp carrier (scale + zero-point folded + # into one load); symmetric layers pass packed_scale_zp=None and the kernel + # reads scales directly with a constant -8 offset (no carrier overhead). 
output = triton_w4a16_skinny_fmt_gemm( - a=x_2d, - b_q=w_q_i32, - scales=w_s, - group_size=group_size, - zp=w_zp, + x_2d, + w_q_i32, + w_s, + group_size, + packed_scale_zp=packed_scale_zp, ) if bias is not None: output.add_(bias) @@ -439,6 +651,7 @@ def _hybrid_w4a16_apply_fake( bias: torch.Tensor | None, cu_count: int, group_size: int, + packed_scale_zp: torch.Tensor | None = None, ) -> torch.Tensor: M = x_2d.size(0) N = w_q.size(0) @@ -531,41 +744,8 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: if getattr(w_q_raw, "output_dim", 0) != 0: unpacked = unpacked.t().contiguous() - # ---- Pack into skinny format: [N, K//8] ExLlama shuffle ---- - shuffled = pack_int4_exllama_shuffle(unpacked) - - # ---- Pad K axis by +128 B per row on the gfx1151 cliff ---- - # On gfx1151 (Strix Halo) the int4 wvSplitK skinny kernel hits a sharp - # BW cliff when K_packed = K/2 is a multiple of 2048 B -- multi-row - # weight loads collide on memory-subsystem hash bits (DRAM channel - # and/or MALL slice) downstream of L2. Adding 128 B (one cache line / - # 32 int32 cols) to the row stride breaks the collision. Other gfx11x - # parts (Strix Point gfx1150, Krackan gfx115{2,3}) lack the - # multi-channel / MALL combination that produces the cliff and see no - # benefit from the pad, so the 3% weight-memory overhead is gated to - # gfx1151 only. - N_rows, K8 = shuffled.shape - K_packed_bytes = K8 * 4 # int32 -> bytes - # +128 B per row = one cache line, keeping each row cache-line aligned. - pad_int32 = 32 - if ( - on_gfx1151() - and K_packed_bytes % 2048 == 0 - and shuffled.device.type == "cuda" - ): - padded = torch.empty( - (N_rows, K8 + pad_int32), - dtype=torch.int32, - device=shuffled.device, - ) - padded[:, :K8].copy_(shuffled) - # Both views inherit stride(0) = K8 + pad_int32. 
- w_q_skinny_i32 = padded[:, :K8] - w_q_skinny = padded.view(torch.int8)[:, :K_packed_bytes] - else: - # Store as int8 for skinny kernel, keep int32 view for triton kernel - w_q_skinny_i32 = shuffled.contiguous() - w_q_skinny = w_q_skinny_i32.view(torch.int8) + # ---- Pack into skinny [N, K//8] (ExLlama shuffle + gfx1151 cliff pad) ---- + w_q_skinny, w_q_skinny_i32 = pack_skinny_int4(unpacked) # ---- Prepare skinny scales: normalize to [N, K//G] ---- permute_param_layout_(w_s_raw, input_dim=1, output_dim=0) @@ -598,6 +778,35 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: torch.nn.Parameter(w_q_skinny_i32, requires_grad=False), ) + # Packed scale/zp carrier for the Triton prefill path — built ONLY + # for asymmetric layers, where it folds the two per-group loads (scale + + # zp) into one. Symmetric layers skip it: the -8 offset is a constant, so + # there is no second load to fold and the carrier would be pure overhead + # (measured ~+8% on fp16 sym); sym reads scales directly instead. + # Layout (matches the kernel's HAS_ZP dequant): + # fp16: low16 = scale, high16 = bias_eff (= -8*scale - (zp-8)*scale). + # Consumed via one fp16 FMA with the magic-constant i4->fp16 unpack. + # bf16: low16 = scale (bf16 bits), high16 = zp_int (raw zp 0..15), as a + # plain integer. Consumed by the int-domain subtract (RDNA3 has no + # v_pk_fma_bf16). Bit-identical to the separate scale+zp loads. 
+ if c.zero_points and c.act_type in (torch.float16, torch.bfloat16): + scale_u16 = w_s_skinny.view(torch.uint16).to(torch.int32) & 0xFFFF + if c.act_type == torch.float16: + w_s_f32 = w_s_skinny.to(torch.float32) + scaled_zp_f32 = (w_zp.to(torch.float32) - 8.0) * w_s_f32 + bias_eff = (-(8.0 * w_s_f32 + scaled_zp_f32)).to(c.act_type) + bias_u16 = bias_eff.contiguous().view(torch.uint16) + hi_u16 = bias_u16.to(torch.int32) & 0xFFFF + else: + hi_u16 = w_zp.to(torch.int32) & 0xFFFF # raw zp 0..15 + packed_scale_zp = ( + ((hi_u16 << 16) | scale_u16).view(torch.float32).contiguous() + ) + layer.register_parameter( + "_hybrid_w_packed_scale_zp", + torch.nn.Parameter(packed_scale_zp, requires_grad=False), + ) + def apply_weights( self, layer: torch.nn.Module, @@ -609,6 +818,8 @@ def apply_weights( c = self.config w_q, w_s, w_zp, _ = self._get_weight_params(layer) w_q_i32 = layer._hybrid_w_q_i32 + # Packed scale/zp carrier (asymmetric layers only; None for sym). + packed_scale_zp = getattr(layer, "_hybrid_w_packed_scale_zp", None) x_2d = x.reshape(-1, x.shape[-1]) N = w_q.shape[0] @@ -624,5 +835,6 @@ def apply_weights( bias, cu_count, c.group_size, + packed_scale_zp, ) return output.reshape(out_shape)