Removing GPU + Shape Specialization via Autotune #16

Open
wants to merge 18 commits into base: main
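This PR replaces FLUTE's import-time GPU specialization, in which the kernel was picked from a hard-coded compute-capability table (A6000, A100, RTX 4090), with a single templated binding that takes the kernel `template_id` and the device's SM count as per-call arguments chosen by the autotuner. As a minimal sketch of the idea (not code from this PR): the SM count can be read from the device at call time, and `pick_template_id` below is a hypothetical stand-in for the autotune lookup.

import torch

def get_num_sms(device: int = 0) -> int:
    # Number of streaming multiprocessors on the device, e.g. 108 on an A100.
    # This replaces a NUM_SMS constant fixed at import from the compute capability.
    return torch.cuda.get_device_properties(device).multi_processor_count

def pick_template_id(m: int, n: int, k: int) -> int:
    # Hypothetical stand-in: a real autotune lookup would map the GEMM shape
    # (and device) to a tuned kernel template; 0 is a placeholder.
    return 0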
6 changes: 4 additions & 2 deletions .github/workflows/wheels.yaml
@@ -16,13 +16,15 @@ jobs:
     strategy:
       matrix:
         os: ['ubuntu-20.04']
-        python-version: ['3.8', '3.9', '3.10', '3.11']
+        python-version: ['3.9', '3.10', '3.11']
         pytorch-version: ['2.4.0']
         cuda-version: ['11.8', '12.1']

     steps:
     - name: Checkout
       uses: actions/checkout@v4
+      with:
+        submodules: recursive

     - name: Set up Linux Env
       if: ${{ runner.os == 'Linux' }}
@@ -32,7 +34,7 @@ jobs:
     - name: Set Swap Space
       uses: pierotofy/set-swap-space@master
       with:
-        swap-size-gb: 10
+        swap-size-gb: 12
143 changes: 36 additions & 107 deletions flute/__init__.py
@@ -2,100 +2,52 @@
 import torch
 import click
 from typing import Callable, cast
-from vllm.platforms import current_platform

 from . import _C
 from . import ops

-__version__ = "0.3.0"
+__version__ = "0.4.0"

-QGEMM_SIMPLE_TYPE = Callable[
-    [
-        torch.Tensor,
-        torch.Tensor,
-        torch.Tensor,
-        torch.Tensor,
-        torch.Tensor,
-        torch.Tensor,
-        int,
-        int,
-    ],
-    torch.Tensor,
-]
-
-QGEMM_RAW_SIMPLE_TYPE = Callable[
-    [
-        torch.Tensor,
-        torch.Tensor,
-        torch.Tensor,
-        torch.Tensor,
-        torch.Tensor,
-        torch.Tensor,
-        torch.Tensor,
-        int,
-        int,
-        int,
-    ],
-    None,
-]
-
-QGEMM_HADAMARD_TYPE = Callable[
-    [
-        torch.Tensor,
-        torch.Tensor,
-        torch.Tensor,
-        torch.Tensor,
-        torch.Tensor,
-        torch.Tensor,
-        int,
-        int,
-        int,
-    ],
-    torch.Tensor,
-]
-
-
-# we use this instead of `torch.cuda.get_device_capability()` so that
-# it works better with multiprocessing (which vLLM uses)
-TORCH_CURRENT_DEVICE_CC = current_platform.get_device_capability()
-
-if TORCH_CURRENT_DEVICE_CC == (8, 6):
-    click.secho(f"[FLUTE]: Using A6000 with CC={TORCH_CURRENT_DEVICE_CC}", fg="green")
-    NUM_SMS = 84
-
-elif TORCH_CURRENT_DEVICE_CC == (8, 0):
-    click.secho(f"[FLUTE]: Using A100 with CC={TORCH_CURRENT_DEVICE_CC}", fg="green")
-    NUM_SMS = 108
-
-elif TORCH_CURRENT_DEVICE_CC == (8, 9):
-    click.secho(f"[FLUTE]: Using RTX4090 with CC={TORCH_CURRENT_DEVICE_CC}", fg="green")
-    NUM_SMS = 128
-
-else:
-    raise NotImplementedError
-
-
-QGEMM_SIMPLE_DICT = {
-    84 : cast(QGEMM_SIMPLE_TYPE, torch.ops.flute.qgemm_simple_86),
-    108: cast(QGEMM_SIMPLE_TYPE, torch.ops.flute.qgemm_simple_80),
-    128: cast(QGEMM_SIMPLE_TYPE, torch.ops.flute.qgemm_simple_89),
-}
-
-# QGEMM_RAW_SIMPLE_DICT = {
-#     84 : cast(QGEMM_RAW_SIMPLE_TYPE, torch.ops.flute.qgemm_raw_simple_86),
-#     108: cast(QGEMM_RAW_SIMPLE_TYPE, torch.ops.flute.qgemm_raw_simple_80),
-#     128: cast(QGEMM_RAW_SIMPLE_TYPE, torch.ops.flute.qgemm_raw_simple_89),
-# }
-
-QGEMM_HADAMARD_DICT = {
-    84 : cast(QGEMM_HADAMARD_TYPE, torch.ops.flute.qgemm_hadamard_86),
-    108: cast(QGEMM_HADAMARD_TYPE, torch.ops.flute.qgemm_hadamard_80),
-    128: cast(QGEMM_HADAMARD_TYPE, torch.ops.flute.qgemm_hadamard_89),
-}
-
-qgemm_simple = QGEMM_SIMPLE_DICT[NUM_SMS]
-qgemm_raw_simple = None  # QGEMM_RAW_SIMPLE_DICT[NUM_SMS]
-qgemm_hadamard = QGEMM_HADAMARD_DICT[NUM_SMS]
+qgemm = cast(
+    Callable[
+        [
+            torch.Tensor,  # inputs
+            torch.Tensor,  # weight
+            torch.Tensor,  # scales
+            torch.Tensor,  # tables
+            torch.Tensor,  # tables2
+            torch.Tensor,  # workspace
+            int,           # num_bits
+            int,           # group_size
+            int,           # template_id
+            int,           # num_sms
+        ],
+        torch.Tensor,
+    ],
+    torch.ops.flute.qgemm_raw_simple,
+)
+
+
+qgemm_hadamard = cast(
+    Callable[
+        [
+            torch.Tensor,  # inputs
+            torch.Tensor,  # weight
+            torch.Tensor,  # scales
+            torch.Tensor,  # tables
+            torch.Tensor,  # tables2
+            torch.Tensor,  # workspace
+            int,           # num_bits
+            int,           # group_size
+            int,           # hadamard_size
+            int,           # template_id
+            int,           # num_sms
+        ],
+        torch.Tensor,
+    ],
+    torch.ops.flute.qgemm_raw_simple_hadamard,
+)


 # Load the template configs
@@ -115,26 +67,3 @@
 else:
     TEMPLATE_CONFIGS = None
     click.secho(f"[FLUTE]: Template configs not found at {TEMPLATE_CONFIGS_PATH}", fg="red")
-
-
-# Load the tuned configs
-TEMPLATE_TUNED_WITH_M_CONFIGS_PATH = os.path.join(
-    os.path.dirname(os.path.abspath(__file__)),
-    "data/qgemm_kernel_raw_tuned_configs.pth")
-TEMPLATE_TUNED_WITHOUT_M_CONFIGS_PATH = os.path.join(
-    os.path.dirname(os.path.abspath(__file__)),
-    "data/qgemm_kernel_raw_tuned_configs.no-M.pth")
-
-if os.path.exists(TEMPLATE_TUNED_WITH_M_CONFIGS_PATH):
-    TEMPLATE_TUNED_WITH_M_CONFIGS = torch.load(TEMPLATE_TUNED_WITH_M_CONFIGS_PATH, weights_only=True)
-    click.secho(f"[FLUTE]: Template (tuned, with M) configs loaded from {TEMPLATE_TUNED_WITH_M_CONFIGS_PATH}", fg="green")
-else:
-    TEMPLATE_TUNED_WITH_M_CONFIGS = None
-    click.secho(f"[FLUTE]: Template (tuned, with M) configs not found at {TEMPLATE_TUNED_WITH_M_CONFIGS_PATH}", fg="red")
-
-if os.path.exists(TEMPLATE_TUNED_WITHOUT_M_CONFIGS_PATH):
-    TEMPLATE_TUNED_WITHOUT_M_CONFIGS = torch.load(TEMPLATE_TUNED_WITHOUT_M_CONFIGS_PATH, weights_only=True)
-    click.secho(f"[FLUTE]: Template (tuned, without M) configs loaded from {TEMPLATE_TUNED_WITHOUT_M_CONFIGS_PATH}", fg="green")
-else:
-    TEMPLATE_TUNED_WITHOUT_M_CONFIGS = None
-    click.secho(f"[FLUTE]: Template (tuned, without M) configs not found at {TEMPLATE_TUNED_WITHOUT_M_CONFIGS_PATH}", fg="red")