diff --git a/docker/Dockerfile.gpu b/docker/Dockerfile.gpu
index 474a6f5..f8cd061 100644
--- a/docker/Dockerfile.gpu
+++ b/docker/Dockerfile.gpu
@@ -42,9 +42,11 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
 
 # Download and install pre-compiled llama.cpp binary
 RUN wget -q https://github.com/ggml-org/llama.cpp/releases/download/${LLAMA_CPP_VERSION}/llama-${LLAMA_CPP_VERSION}-bin-ubuntu-x64.zip -O /tmp/llama.zip && \
-    unzip -q /tmp/llama.zip -d /opt && \
-    mv /opt/llama-${LLAMA_CPP_VERSION}-bin-ubuntu-x64 /opt/llama.cpp && \
-    rm /tmp/llama.zip && \
+    unzip -q /tmp/llama.zip -d /tmp/llama-extract && \
+    EXTRACTED_DIR=$(find /tmp/llama-extract -maxdepth 1 -type d -name "llama-*" | head -1) && \
+    if [ -z "$EXTRACTED_DIR" ]; then echo "Error: No llama directory found after extraction"; ls -la /tmp/llama-extract; exit 1; fi && \
+    mv "$EXTRACTED_DIR" /opt/llama.cpp && \
+    rm -rf /tmp/llama.zip /tmp/llama-extract && \
     chmod +x /opt/llama.cpp/llama-* && \
     pip install gguf
 
diff --git a/src/msquant/core/quantizer/engine.py b/src/msquant/core/quantizer/engine.py
index 79b97dc..18eca1a 100644
--- a/src/msquant/core/quantizer/engine.py
+++ b/src/msquant/core/quantizer/engine.py
@@ -277,6 +277,13 @@ def _check_llama_cpp_available():
     def _download_model(model_id: str, cache_dir: str, logger: QuantizationLogger) -> str:
         """Download HuggingFace model to local cache."""
         from huggingface_hub import snapshot_download
+        from huggingface_hub.errors import (
+            HfHubHTTPError,
+            RepositoryNotFoundError,
+            GatedRepoError,
+            LocalEntryNotFoundError,
+        )
+        from requests.exceptions import ConnectionError, Timeout
 
         logger.info(f"Downloading model {model_id} to cache...")
         try:
@@ -287,8 +294,47 @@ def _download_model(model_id: str, cache_dir: str, logger: QuantizationLogger) -> str:
             )
             logger.info(f"Model downloaded to {local_path}")
             return local_path
+        except GatedRepoError as e:
+            # Must be before RepositoryNotFoundError since it's a subclass
+            raise RuntimeError(
+                f"Model '{model_id}' is gated and requires authentication. "
+                f"Please log in with 'huggingface-cli login' and ensure you have access."
+            ) from e
+        except LocalEntryNotFoundError as e:
+            # Must be before HfHubHTTPError since it's a subclass
+            raise RuntimeError(
+                f"Model files not found for '{model_id}'. "
+                f"The repository may be empty or misconfigured."
+            ) from e
+        except RepositoryNotFoundError as e:
+            raise RuntimeError(
+                f"Model '{model_id}' not found on HuggingFace Hub. "
+                f"Please verify the model ID is correct."
+            ) from e
+        except HfHubHTTPError as e:
+            if e.response.status_code == 401:
+                raise RuntimeError(
+                    f"Authentication failed for model '{model_id}'. "
+                    f"Please log in with 'huggingface-cli login'."
+                ) from e
+            elif e.response.status_code == 403:
+                raise RuntimeError(
+                    f"Access denied for model '{model_id}'. "
+                    f"You may need to accept the model's license agreement on HuggingFace Hub."
+                ) from e
+            else:
+                raise RuntimeError(
+                    f"HTTP error {e.response.status_code} while downloading model '{model_id}': {e}"
+                ) from e
+        except (ConnectionError, Timeout) as e:
+            raise RuntimeError(
+                f"Network error while downloading model '{model_id}'. "
+                f"Please check your internet connection and try again."
+            ) from e
         except Exception as e:
-            raise RuntimeError(f"Failed to download model: {e}") from e
+            raise RuntimeError(
+                f"Failed to download model '{model_id}': {e}"
+            ) from e
 
     @staticmethod
     def _convert_to_gguf_intermediate(
@@ -340,22 +386,30 @@ def _convert_to_gguf_intermediate(
         except Exception as e:
             raise RuntimeError(f"GGUF conversion failed: {e}") from e
 
+    @staticmethod
+    def _normalize_format(format_str: str) -> str:
+        """Normalize format string for comparison (uppercase, replace hyphens with underscores)."""
+        return format_str.upper().replace('-', '_')
+
     @staticmethod
     def _quantize_gguf(
         input_file: str,
         output_file: str,
         quant_type: str,
+        intermediate_format: str,
         logger: QuantizationLogger
     ):
         """Quantize GGUF file to target precision."""
         logger.info(f"Quantizing GGUF to {quant_type}...")
 
-        # Skip quantization if target format is already F16 or F32
-        if quant_type in ["F16", "F32"]:
-            logger.info(f"Target format {quant_type} matches intermediate format, skipping quantization")
-            # Copy the file instead
-            import shutil
-            shutil.copy2(input_file, output_file)
+        # Skip quantization if target format matches intermediate format
+        # Normalize both formats for comparison (handle case and separator differences)
+        if GGUFQuantizer._normalize_format(quant_type) == GGUFQuantizer._normalize_format(intermediate_format):
+            logger.info(f"Target format {quant_type} matches intermediate format {intermediate_format}, skipping quantization")
+            # Only copy if the filenames are different
+            if input_file != output_file:
+                import shutil
+                shutil.copy2(input_file, output_file)
             return
 
         # Build the quantization command
@@ -442,13 +496,22 @@ def run(config: QuantizationConfig, logger: QuantizationLogger):
             intermediate_file,
             final_file,
             config.gguf_quant_type,
+            config.gguf_intermediate_format,
             logger
         )
 
         # Clean up intermediate file if different from final
-        if intermediate_file != final_file and os.path.exists(intermediate_file):
-            logger.info(f"Cleaning up intermediate file: {intermediate_file}")
-            os.remove(intermediate_file)
+        # Use os.path.samefile to handle case-insensitive filesystems
+        try:
+            if os.path.exists(intermediate_file) and os.path.exists(final_file):
+                if not os.path.samefile(intermediate_file, final_file):
+                    logger.info(f"Cleaning up intermediate file: {intermediate_file}")
+                    os.remove(intermediate_file)
+        except (OSError, ValueError):
+            # If samefile fails, fall back to string comparison
+            if intermediate_file != final_file and os.path.exists(intermediate_file):
+                logger.info(f"Cleaning up intermediate file: {intermediate_file}")
+                os.remove(intermediate_file)
 
         dt = time.time() - t0
         logger.info(f"Completed. Saved GGUF quantized model to {final_file} in {dt:.1f}s")
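A few notes on the non-obvious parts of this change.

First, the except-clause ordering in _download_model is load-bearing: in huggingface_hub, GatedRepoError subclasses RepositoryNotFoundError, and LocalEntryNotFoundError ultimately subclasses HfHubHTTPError, so the more specific handlers must appear first or they would never be reached. A minimal sketch that verifies this assumption against the installed huggingface_hub (class hierarchies can shift between releases):

# Sanity check for the handler ordering assumed in _download_model.
# If either assertion fails on your huggingface_hub version, the
# corresponding except clauses can be reordered independently.
from huggingface_hub.errors import (
    GatedRepoError,
    HfHubHTTPError,
    LocalEntryNotFoundError,
    RepositoryNotFoundError,
)

# A subclass listed after its parent would be unreachable in a try/except chain.
assert issubclass(GatedRepoError, RepositoryNotFoundError)
assert issubclass(LocalEntryNotFoundError, HfHubHTTPError)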
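Second, the _normalize_format comparison is what lets the skip-quantization path keep working when the target and intermediate format strings are spelled differently. A sketch of the rule; the sample strings below are illustrative, not an exhaustive list of supported quant types:

# Illustration of the normalization rule from _normalize_format:
# uppercase everything and map hyphens to underscores, so spelling
# variants of the same format compare equal.
def normalize_format(format_str: str) -> str:
    return format_str.upper().replace('-', '_')

assert normalize_format("f16") == normalize_format("F16")
assert normalize_format("q4-k-m") == normalize_format("Q4_K_M")
assert normalize_format("BF16") != normalize_format("F16")  # distinct formats stay distinct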
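Finally, the samefile-based cleanup exists because on case-insensitive filesystems (the macOS and Windows defaults) intermediate_file and final_file can differ as strings yet name the same file on disk, and the old != check would then delete the finished model. A condensed sketch of the decision logic; the helper name _is_same_output is hypothetical, not part of this diff:

import os

def _is_same_output(intermediate_file: str, final_file: str) -> bool:
    """True if both paths resolve to the same file on disk.

    os.path.samefile compares (device, inode), so it catches
    case-only path differences on case-insensitive filesystems.
    If it cannot answer (missing file, unsupported path), fall
    back to plain string equality, as the diff does.
    """
    try:
        return os.path.samefile(intermediate_file, final_file)
    except (OSError, ValueError):
        return intermediate_file == final_file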