[Kernel] Flash Attention 3 Support (#12093)

Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
This commit is contained in:
Lucas Wilkinson
2025-01-23 09:45:48 -05:00
committed by GitHub
parent c5b4b11d7f
commit 978b45f399
8 changed files with 151 additions and 83 deletions
+8 -4
View File
@@ -228,8 +228,11 @@ class cmake_build_ext(build_ext):
# CMake appends the extension prefix to the install path,
# and outdir already contains that prefix, so we need to remove it.
# We assume only the final component of extension prefix is added by
# CMake, this is currently true for current extensions but may not
# always be the case.
prefix = outdir
for i in range(ext.name.count('.')):
if '.' in ext.name:
prefix = prefix.parent
# prefix here should actually be the same for all components
@@ -298,7 +301,8 @@ class repackage_wheel(build_ext):
files_to_copy = [
"vllm/_C.abi3.so",
"vllm/_moe_C.abi3.so",
"vllm/vllm_flash_attn/vllm_flash_attn_c.abi3.so",
"vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so",
"vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so",
"vllm/vllm_flash_attn/flash_attn_interface.py",
"vllm/vllm_flash_attn/__init__.py",
"vllm/cumem_allocator.abi3.so",
@@ -593,8 +597,8 @@ if _is_hip():
ext_modules.append(CMakeExtension(name="vllm._rocm_C"))
if _is_cuda():
ext_modules.append(
CMakeExtension(name="vllm.vllm_flash_attn.vllm_flash_attn_c"))
ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa2_C"))
ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa3_C"))
ext_modules.append(CMakeExtension(name="vllm.cumem_allocator"))
if _build_custom_ops():