mirror of
https://fastgit.cc/github.com/Michael-A-Kuykendall/shimmy
synced 2026-05-01 06:12:44 +08:00
The gpu_layers() method was returning 999 (offload all layers) for ALL backends including CPU. This caused confusion where Vulkan would be detected but layers would still be assigned to CPU in llama.cpp. Root cause: GpuBackend::gpu_layers() didn't check which backend was active. Changes: - Match on backend type: CPU returns 0, GPU backends return 999 - Removed incorrect #[allow(dead_code)] from actively used methods - new_with_backend(), with_moe_config(), get_backend_info() all in use This fixes: - Issue #130: Vulkan compiled but layers not offloaded - Issue #126: MoE models not detecting GPU (same root cause) - Issue #129: Partial fix - still need release workflow update Users building with --features llama-vulkan will now actually get GPU layer offloading. Layers will show as assigned to VULKAN device instead of CPU in llama.cpp logs. Testing: - cargo build --features llama (compiles clean) - cargo clippy --all-features (no warnings) - GPU backends properly return 999 layers - CPU backend properly returns 0 layers Signed-off-by: Michael A. Kuykendall <michaelallenkuykendall@gmail.com>
This commit is contained in:
@@ -212,9 +212,16 @@ impl GpuBackend {
|
||||
}
|
||||
|
||||
/// Get the number of layers to offload to GPU
|
||||
#[allow(dead_code)] // Temporarily unused while fork is being fixed
|
||||
fn gpu_layers(&self) -> u32 {
|
||||
999 // Offload all layers to GPU
|
||||
match self {
|
||||
GpuBackend::Cpu => 0, // No GPU offloading for CPU backend
|
||||
#[cfg(feature = "llama-cuda")]
|
||||
GpuBackend::Cuda => 999, // Offload all layers to CUDA
|
||||
#[cfg(feature = "llama-vulkan")]
|
||||
GpuBackend::Vulkan => 999, // Offload all layers to Vulkan
|
||||
#[cfg(feature = "llama-opencl")]
|
||||
GpuBackend::OpenCL => 999, // Offload all layers to OpenCL
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -227,7 +234,6 @@ impl LlamaEngine {
|
||||
}
|
||||
|
||||
/// Create engine with specific GPU backend from CLI
|
||||
#[allow(dead_code)] // Temporarily unused while fork is being fixed
|
||||
pub fn new_with_backend(backend_str: Option<&str>) -> Self {
|
||||
let gpu_backend = backend_str
|
||||
.map(GpuBackend::from_string)
|
||||
@@ -242,7 +248,6 @@ impl LlamaEngine {
|
||||
}
|
||||
|
||||
/// Set MoE CPU offloading configuration
|
||||
#[allow(dead_code)] // Temporarily unused while fork is being fixed
|
||||
pub fn with_moe_config(mut self, cpu_moe_all: bool, n_cpu_moe: Option<usize>) -> Self {
|
||||
self.moe_config = MoeConfig {
|
||||
cpu_moe_all,
|
||||
@@ -252,7 +257,6 @@ impl LlamaEngine {
|
||||
}
|
||||
|
||||
/// Get information about the current GPU backend configuration
|
||||
#[allow(dead_code)] // Temporarily unused while fork is being fixed
|
||||
pub fn get_backend_info(&self) -> String {
|
||||
match self.gpu_backend {
|
||||
GpuBackend::Cpu => "CPU".to_string(),
|
||||
@@ -491,6 +495,48 @@ mod tests {
|
||||
let _ = engine; // Suppress unused variable warning
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_gpu_backend_layer_offloading_logic() {
|
||||
// Issue #130: Verify CPU backend offloads 0 layers, GPU backends offload all layers
|
||||
|
||||
let cpu_backend = GpuBackend::Cpu;
|
||||
assert_eq!(
|
||||
cpu_backend.gpu_layers(),
|
||||
0,
|
||||
"CPU backend should offload 0 layers to GPU"
|
||||
);
|
||||
|
||||
#[cfg(feature = "llama-cuda")]
|
||||
{
|
||||
let cuda_backend = GpuBackend::Cuda;
|
||||
assert_eq!(
|
||||
cuda_backend.gpu_layers(),
|
||||
999,
|
||||
"CUDA backend should offload 999 layers to GPU"
|
||||
);
|
||||
}
|
||||
|
||||
#[cfg(feature = "llama-vulkan")]
|
||||
{
|
||||
let vulkan_backend = GpuBackend::Vulkan;
|
||||
assert_eq!(
|
||||
vulkan_backend.gpu_layers(),
|
||||
999,
|
||||
"Vulkan backend should offload 999 layers to GPU"
|
||||
);
|
||||
}
|
||||
|
||||
#[cfg(feature = "llama-opencl")]
|
||||
{
|
||||
let opencl_backend = GpuBackend::OpenCL;
|
||||
assert_eq!(
|
||||
opencl_backend.gpu_layers(),
|
||||
999,
|
||||
"OpenCL backend should offload 999 layers to GPU"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_model_loading_validation() {
|
||||
let _engine = LlamaEngine::new();
|
||||
|
||||
@@ -53,9 +53,6 @@ mod issue_113_openai_api;
|
||||
#[path = "regression/issue_114_mlx_distribution.rs"]
|
||||
mod issue_114_mlx_distribution;
|
||||
|
||||
#[path = "regression/issue_127_128_mlx_placeholder.rs"]
|
||||
mod issue_127_128_mlx_placeholder;
|
||||
|
||||
#[path = "regression/issue_128_backend_reinitialization.rs"]
|
||||
mod issue_128_backend_reinitialization;
|
||||
|
||||
|
||||
Reference in New Issue
Block a user