diff --git a/docker/Dockerfile.gpu b/docker/Dockerfile.gpu
index 474a6f5..f8cd061 100644
--- a/docker/Dockerfile.gpu
+++ b/docker/Dockerfile.gpu
@@ -42,9 +42,11 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
 
 # Download and install pre-compiled llama.cpp binary
 RUN wget -q https://github.com/ggml-org/llama.cpp/releases/download/${LLAMA_CPP_VERSION}/llama-${LLAMA_CPP_VERSION}-bin-ubuntu-x64.zip -O /tmp/llama.zip && \
-    unzip -q /tmp/llama.zip -d /opt && \
-    mv /opt/llama-${LLAMA_CPP_VERSION}-bin-ubuntu-x64 /opt/llama.cpp && \
-    rm /tmp/llama.zip && \
+    unzip -q /tmp/llama.zip -d /tmp/llama-extract && \
+    EXTRACTED_DIR=$(find /tmp/llama-extract -maxdepth 1 -type d -name "llama-*" | head -1) && \
+    if [ -z "$EXTRACTED_DIR" ]; then echo "Error: No llama directory found after extraction"; ls -la /tmp/llama-extract; exit 1; fi && \
+    mv "$EXTRACTED_DIR" /opt/llama.cpp && \
+    rm -rf /tmp/llama.zip /tmp/llama-extract && \
     chmod +x /opt/llama.cpp/llama-* && \
     pip install gguf
 
diff --git a/src/msquant/core/quantizer/engine.py b/src/msquant/core/quantizer/engine.py
index 79b97dc..18eca1a 100644
--- a/src/msquant/core/quantizer/engine.py
+++ b/src/msquant/core/quantizer/engine.py
@@ -277,6 +277,13 @@ def _check_llama_cpp_available():
     def _download_model(model_id: str, cache_dir: str, logger: QuantizationLogger) -> str:
         """Download HuggingFace model to local cache."""
         from huggingface_hub import snapshot_download
+        from huggingface_hub.errors import (
+            HfHubHTTPError,
+            RepositoryNotFoundError,
+            GatedRepoError,
+            LocalEntryNotFoundError,
+        )
+        from requests.exceptions import ConnectionError, Timeout
 
         logger.info(f"Downloading model {model_id} to cache...")
         try:
@@ -287,8 +294,47 @@ def _download_model(model_id: str, cache_dir: str, logger: QuantizationLogger) -> str:
             )
             logger.info(f"Model downloaded to {local_path}")
             return local_path
+        except GatedRepoError as e:
+            # Must be before RepositoryNotFoundError since it's a subclass
+            raise RuntimeError(
+                f"Model '{model_id}' is gated and requires authentication. "
+                f"Please log in with 'huggingface-cli login' and ensure you have access."
+            ) from e
+        except LocalEntryNotFoundError as e:
+            # Must be before HfHubHTTPError since it's a subclass
+            raise RuntimeError(
+                f"Model files not found for '{model_id}'. "
+                f"The repository may be empty or misconfigured."
+            ) from e
+        except RepositoryNotFoundError as e:
+            raise RuntimeError(
+                f"Model '{model_id}' not found on HuggingFace Hub. "
+                f"Please verify the model ID is correct."
+            ) from e
+        except HfHubHTTPError as e:
+            if e.response.status_code == 401:
+                raise RuntimeError(
+                    f"Authentication failed for model '{model_id}'. "
+                    f"Please log in with 'huggingface-cli login'."
+                ) from e
+            elif e.response.status_code == 403:
+                raise RuntimeError(
+                    f"Access denied for model '{model_id}'. "
+                    f"You may need to accept the model's license agreement on HuggingFace Hub."
+                ) from e
+            else:
+                raise RuntimeError(
+                    f"HTTP error {e.response.status_code} while downloading model '{model_id}': {e}"
+                ) from e
+        except (ConnectionError, Timeout) as e:
+            raise RuntimeError(
+                f"Network error while downloading model '{model_id}'. "
+                f"Please check your internet connection and try again."
+            ) from e
         except Exception as e:
-            raise RuntimeError(f"Failed to download model: {e}") from e
+            raise RuntimeError(
+                f"Failed to download model '{model_id}': {e}"
+            ) from e
 
     @staticmethod
     def _convert_to_gguf_intermediate(
@@ -340,22 +386,30 @@ def _convert_to_gguf_intermediate(
         except Exception as e:
             raise RuntimeError(f"GGUF conversion failed: {e}") from e
 
+    @staticmethod
+    def _normalize_format(format_str: str) -> str:
+        """Normalize format string for comparison (uppercase, replace hyphens with underscores)."""
+        return format_str.upper().replace('-', '_')
+
     @staticmethod
     def _quantize_gguf(
         input_file: str,
         output_file: str,
         quant_type: str,
+        intermediate_format: str,
         logger: QuantizationLogger
     ):
         """Quantize GGUF file to target precision."""
         logger.info(f"Quantizing GGUF to {quant_type}...")
 
-        # Skip quantization if target format is already F16 or F32
-        if quant_type in ["F16", "F32"]:
-            logger.info(f"Target format {quant_type} matches intermediate format, skipping quantization")
-            # Copy the file instead
-            import shutil
-            shutil.copy2(input_file, output_file)
+        # Skip quantization if target format matches intermediate format
+        # Normalize both formats for comparison (handle case and separator differences)
+        if GGUFQuantizer._normalize_format(quant_type) == GGUFQuantizer._normalize_format(intermediate_format):
+            logger.info(f"Target format {quant_type} matches intermediate format {intermediate_format}, skipping quantization")
+            # Only copy if the filenames are different
+            if input_file != output_file:
+                import shutil
+                shutil.copy2(input_file, output_file)
             return
 
         # Build the quantization command
@@ -442,13 +496,22 @@ def run(config: QuantizationConfig, logger: QuantizationLogger):
             intermediate_file,
             final_file,
             config.gguf_quant_type,
+            config.gguf_intermediate_format,
             logger
         )
 
         # Clean up intermediate file if different from final
-        if intermediate_file != final_file and os.path.exists(intermediate_file):
-            logger.info(f"Cleaning up intermediate file: {intermediate_file}")
-            os.remove(intermediate_file)
+        # Use os.path.samefile to handle case-insensitive filesystems
+        try:
+            if os.path.exists(intermediate_file) and os.path.exists(final_file):
+                if not os.path.samefile(intermediate_file, final_file):
+                    logger.info(f"Cleaning up intermediate file: {intermediate_file}")
+                    os.remove(intermediate_file)
+        except (OSError, ValueError):
+            # If samefile fails, fall back to string comparison
+            if intermediate_file != final_file and os.path.exists(intermediate_file):
+                logger.info(f"Cleaning up intermediate file: {intermediate_file}")
+                os.remove(intermediate_file)
 
         dt = time.time() - t0
         logger.info(f"Completed. Saved GGUF quantized model to {final_file} in {dt:.1f}s")
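A few notes on the non-obvious parts of this change.

First, the except-clause ordering in _download_model is load-bearing: in huggingface_hub, GatedRepoError subclasses RepositoryNotFoundError, and LocalEntryNotFoundError ultimately subclasses HfHubHTTPError, so the more specific handlers must appear first or they would never be reached. A minimal sketch that verifies this assumption against the installed huggingface_hub (class hierarchies can shift between releases):

# Sanity check for the handler ordering assumed in _download_model.
# If either assertion fails on your huggingface_hub version, the
# corresponding except clauses can be reordered independently.
from huggingface_hub.errors import (
    GatedRepoError,
    HfHubHTTPError,
    LocalEntryNotFoundError,
    RepositoryNotFoundError,
)

# A subclass listed after its parent would be unreachable in a try/except chain.
assert issubclass(GatedRepoError, RepositoryNotFoundError)
assert issubclass(LocalEntryNotFoundError, HfHubHTTPError)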
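Second, the _normalize_format comparison is what lets the skip-quantization path keep working when the target and intermediate format strings are spelled differently. A sketch of the rule; the sample strings below are illustrative, not an exhaustive list of supported quant types:

# Illustration of the normalization rule from _normalize_format:
# uppercase everything and map hyphens to underscores, so spelling
# variants of the same format compare equal.
def normalize_format(format_str: str) -> str:
    return format_str.upper().replace('-', '_')

assert normalize_format("f16") == normalize_format("F16")
assert normalize_format("q4-k-m") == normalize_format("Q4_K_M")
assert normalize_format("BF16") != normalize_format("F16")  # distinct formats stay distinct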
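Finally, the samefile-based cleanup exists because on case-insensitive filesystems (the macOS and Windows defaults) intermediate_file and final_file can differ as strings yet name the same file on disk, and the old != check would then delete the finished model. A condensed sketch of the decision logic; the helper name _is_same_output is hypothetical, not part of this diff:

import os

def _is_same_output(intermediate_file: str, final_file: str) -> bool:
    """True if both paths resolve to the same file on disk.

    os.path.samefile compares (device, inode), so it catches
    case-only path differences on case-insensitive filesystems.
    If it cannot answer (missing file, unsupported path), fall
    back to plain string equality, as the diff does.
    """
    try:
        return os.path.samefile(intermediate_file, final_file)
    except (OSError, ValueError):
        return intermediate_file == final_file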