Commit

cr
linoybu committed Feb 3, 2025
1 parent a2357af commit 4b2fd74
Showing 2 changed files with 8 additions and 6 deletions.
@@ -7,8 +7,8 @@
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
     CompressedTensorsScheme)
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
-    apply_fp8_linear, cutlass_fp8_supported, normalize_e4m3fn_to_e4m3fnuz,
-    requantize_with_max_scale, is_gaudi2)
+    apply_fp8_linear, cutlass_fp8_supported, get_gaudi2_scale_factor, is_gaudi2, normalize_e4m3fn_to_e4m3fnuz,
+    requantize_with_max_scale)
 from vllm.model_executor.parameter import (ChannelQuantScaleParameter,
                                            ModelWeightParameter,
                                            PerTensorScaleParameter)
@@ -84,8 +84,7 @@ def process_weights_after_loading(self, layer) -> None:
         if self.is_static_input_scheme and hasattr(layer, 'input_scale'):
             input_scale = layer.input_scale.max()
             if is_gaudi2():
-                input_scale = input_scale * (torch.finfo(torch.float8_e4m3fn).max /
-                                             torch.finfo(torch.float8_e4m3fnuz).max)
+                input_scale = input_scale * get_gaudi2_scale_factor()
             layer.input_scale = Parameter(input_scale,
                                           requires_grad=False)
         else:
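For context, the factor that both hunks apply on Gaudi2 reflects the different dynamic ranges of the two fp8 dtypes: torch.float8_e4m3fn has a maximum representable value of 448, while torch.float8_e4m3fnuz tops out at 240, so a scale taken from an e4m3fn checkpoint is enlarged by 448 / 240 ≈ 1.867 before use. A minimal standalone sketch of that arithmetic (plain PyTorch, outside vLLM; the tensor value is illustrative):

import torch

# Maxima of the two fp8 formats: 448.0 for e4m3fn, 240.0 for e4m3fnuz.
scale_factor = (torch.finfo(torch.float8_e4m3fn).max /
                torch.finfo(torch.float8_e4m3fnuz).max)
print(scale_factor)  # 1.8666...

# Illustrative input scale loaded from an e4m3fn checkpoint.
input_scale = torch.tensor(0.05)
print(input_scale * scale_factor)  # tensor(0.0933)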
7 changes: 5 additions & 2 deletions vllm/model_executor/layers/quantization/utils/w8a8_utils.py
@@ -77,14 +77,17 @@ def is_gaudi2():
     return current_platform.is_hpu() and htexp._get_device_type(
     ) == htexp.synDeviceType.synDeviceGaudi2
 
+def get_gaudi2_scale_factor():
+    return (torch.finfo(torch.float8_e4m3fn).max /
+            torch.finfo(torch.float8_e4m3fnuz).max)
+
 def requantize_with_max_scale(
         weight: torch.Tensor, weight_scale: torch.Tensor,
         logical_widths: List[int]) -> Tuple[torch.Tensor, torch.Tensor]:
     # Max scale to be used for requanitzation.
     max_w_scale = weight_scale.max()
     if is_gaudi2():
-        max_w_scale = max_w_scale * (torch.finfo(torch.float8_e4m3fn).max /
-                                     torch.finfo(torch.float8_e4m3fnuz).max)
+        max_w_scale = max_w_scale * get_gaudi2_scale_factor()
     # QKV / MLP is fused in the on disk checkpoint if any of the
     # weight scales are still set to the default since we initialize
     # N weight scales for N shards but we only load 1 weight scale
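Taken together, the change hoists the duplicated ratio into a single helper and calls it from both call sites. A rough standalone sketch of the resulting flow (not the vLLM code itself; the on_gaudi2 flag stands in for the is_gaudi2() platform check, and the sample scales are made up):

import torch

def get_gaudi2_scale_factor() -> float:
    # Same ratio as the new helper in w8a8_utils.py: e4m3fn max / e4m3fnuz max.
    return (torch.finfo(torch.float8_e4m3fn).max /
            torch.finfo(torch.float8_e4m3fnuz).max)

def requantize_max_scale(weight_scale: torch.Tensor,
                         on_gaudi2: bool) -> torch.Tensor:
    # Collapse per-shard scales to one max scale, then widen it for Gaudi2's
    # narrower fp8 range, mirroring requantize_with_max_scale in the diff.
    max_w_scale = weight_scale.max()
    if on_gaudi2:
        max_w_scale = max_w_scale * get_gaudi2_scale_factor()
    return max_w_scale

scales = torch.tensor([0.010, 0.020, 0.015])
print(requantize_max_scale(scales, on_gaudi2=False))  # tensor(0.0200)
print(requantize_max_scale(scales, on_gaudi2=True))   # tensor(0.0373)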
