Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions docker/Dockerfile.gpu
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,11 @@ RUN apt-get update && apt-get install -y --no-install-recommends \

# Download and install pre-compiled llama.cpp binary
RUN wget -q https://github.com/ggml-org/llama.cpp/releases/download/${LLAMA_CPP_VERSION}/llama-${LLAMA_CPP_VERSION}-bin-ubuntu-x64.zip -O /tmp/llama.zip && \
unzip -q /tmp/llama.zip -d /opt && \
mv /opt/llama-${LLAMA_CPP_VERSION}-bin-ubuntu-x64 /opt/llama.cpp && \
rm /tmp/llama.zip && \
unzip -q /tmp/llama.zip -d /tmp/llama-extract && \
EXTRACTED_DIR=$(find /tmp/llama-extract -maxdepth 1 -type d -name "llama-*" | head -1) && \
if [ -z "$EXTRACTED_DIR" ]; then echo "Error: No llama directory found after extraction"; ls -la /tmp/llama-extract; exit 1; fi && \
Copy link

Copilot AI Nov 4, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

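[nitpick] The error handling packs the whole if/echo/ls/exit sequence onto a single line of the && chain, making it harder to read and debug. Consider breaking it across separate continuation lines within the same RUN for better clarity. (Splitting it into multiple RUN instructions would not work here: EXTRACTED_DIR is a shell variable and does not persist from one RUN instruction to the next.)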

Copilot uses AI. Check for mistakes.
mv "$EXTRACTED_DIR" /opt/llama.cpp && \
rm -rf /tmp/llama.zip /tmp/llama-extract && \
chmod +x /opt/llama.cpp/llama-* && \
pip install gguf

Expand Down
83 changes: 73 additions & 10 deletions src/msquant/core/quantizer/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,13 @@ def _check_llama_cpp_available():
def _download_model(model_id: str, cache_dir: str, logger: QuantizationLogger) -> str:
"""Download HuggingFace model to local cache."""
from huggingface_hub import snapshot_download
from huggingface_hub.errors import (
HfHubHTTPError,
RepositoryNotFoundError,
GatedRepoError,
LocalEntryNotFoundError,
)
from requests.exceptions import ConnectionError, Timeout

logger.info(f"Downloading model {model_id} to cache...")
try:
Expand All @@ -287,8 +294,47 @@ def _download_model(model_id: str, cache_dir: str, logger: QuantizationLogger) -
)
logger.info(f"Model downloaded to {local_path}")
return local_path
except GatedRepoError as e:
# Must be before RepositoryNotFoundError since it's a subclass
raise RuntimeError(
f"Model '{model_id}' is gated and requires authentication. "
f"Please log in with 'huggingface-cli login' and ensure you have access."
) from e
except LocalEntryNotFoundError as e:
# Must be before HfHubHTTPError since it's a subclass
raise RuntimeError(
f"Model files not found for '{model_id}'. "
f"The repository may be empty or misconfigured."
) from e
except RepositoryNotFoundError as e:
raise RuntimeError(
f"Model '{model_id}' not found on HuggingFace Hub. "
f"Please verify the model ID is correct."
) from e
except HfHubHTTPError as e:
if e.response.status_code == 401:
raise RuntimeError(
f"Authentication failed for model '{model_id}'. "
f"Please log in with 'huggingface-cli login'."
) from e
elif e.response.status_code == 403:
raise RuntimeError(
f"Access denied for model '{model_id}'. "
f"You may need to accept the model's license agreement on HuggingFace Hub."
) from e
else:
Comment on lines +315 to +325
Copy link

Copilot AI Nov 4, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

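Potential AttributeError if HfHubHTTPError doesn't have a response attribute or if response is None. Add a guard check that compares against None explicitly rather than relying on truthiness, because a requests.Response evaluates to False for any 4xx/5xx status and a bare `e.response and ...` test would therefore skip the 401/403 branches entirely: if getattr(e, 'response', None) is not None and e.response.status_code == 401: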

Suggested change
if e.response.status_code == 401:
raise RuntimeError(
f"Authentication failed for model '{model_id}'. "
f"Please log in with 'huggingface-cli login'."
) from e
elif e.response.status_code == 403:
raise RuntimeError(
f"Access denied for model '{model_id}'. "
f"You may need to accept the model's license agreement on HuggingFace Hub."
) from e
else:
if hasattr(e, 'response') and e.response and e.response.status_code == 401:
raise RuntimeError(
f"Authentication failed for model '{model_id}'. "
f"Please log in with 'huggingface-cli login'."
) from e
elif hasattr(e, 'response') and e.response and e.response.status_code == 403:
raise RuntimeError(
f"Access denied for model '{model_id}'. "
f"You may need to accept the model's license agreement on HuggingFace Hub."
) from e
elif hasattr(e, 'response') and e.response and hasattr(e.response, 'status_code'):

Copilot uses AI. Check for mistakes.
raise RuntimeError(
f"HTTP error {e.response.status_code} while downloading model '{model_id}': {e}"
) from e
except (ConnectionError, Timeout) as e:
raise RuntimeError(
f"Network error while downloading model '{model_id}'. "
f"Please check your internet connection and try again."
) from e
except Exception as e:
raise RuntimeError(f"Failed to download model: {e}") from e
raise RuntimeError(
f"Failed to download model '{model_id}': {e}"
) from e

@staticmethod
def _convert_to_gguf_intermediate(
Expand Down Expand Up @@ -340,22 +386,30 @@ def _convert_to_gguf_intermediate(
except Exception as e:
raise RuntimeError(f"GGUF conversion failed: {e}") from e

@staticmethod
def _normalize_format(format_str: str) -> str:
"""Normalize format string for comparison (uppercase, replace hyphens with underscores)."""
return format_str.upper().replace('-', '_')

@staticmethod
def _quantize_gguf(
input_file: str,
output_file: str,
quant_type: str,
intermediate_format: str,
logger: QuantizationLogger
):
"""Quantize GGUF file to target precision."""
logger.info(f"Quantizing GGUF to {quant_type}...")

# Skip quantization if target format is already F16 or F32
if quant_type in ["F16", "F32"]:
logger.info(f"Target format {quant_type} matches intermediate format, skipping quantization")
# Copy the file instead
import shutil
shutil.copy2(input_file, output_file)
# Skip quantization if target format matches intermediate format
# Normalize both formats for comparison (handle case and separator differences)
if GGUFQuantizer._normalize_format(quant_type) == GGUFQuantizer._normalize_format(intermediate_format):
logger.info(f"Target format {quant_type} matches intermediate format {intermediate_format}, skipping quantization")
# Only copy if the filenames are different
if input_file != output_file:
import shutil
shutil.copy2(input_file, output_file)
return

# Build the quantization command
Expand Down Expand Up @@ -442,13 +496,22 @@ def run(config: QuantizationConfig, logger: QuantizationLogger):
intermediate_file,
final_file,
config.gguf_quant_type,
config.gguf_intermediate_format,
logger
)

# Clean up intermediate file if different from final
if intermediate_file != final_file and os.path.exists(intermediate_file):
logger.info(f"Cleaning up intermediate file: {intermediate_file}")
os.remove(intermediate_file)
# Use os.path.samefile to handle case-insensitive filesystems
try:
if os.path.exists(intermediate_file) and os.path.exists(final_file):
if not os.path.samefile(intermediate_file, final_file):
logger.info(f"Cleaning up intermediate file: {intermediate_file}")
os.remove(intermediate_file)
Comment on lines +506 to +509
Copy link

Copilot AI Nov 4, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[nitpick] The nested condition on line 507 can be combined with line 506 for better readability: if os.path.exists(intermediate_file) and os.path.exists(final_file) and not os.path.samefile(intermediate_file, final_file):

Suggested change
if os.path.exists(intermediate_file) and os.path.exists(final_file):
if not os.path.samefile(intermediate_file, final_file):
logger.info(f"Cleaning up intermediate file: {intermediate_file}")
os.remove(intermediate_file)
if os.path.exists(intermediate_file) and os.path.exists(final_file) and not os.path.samefile(intermediate_file, final_file):
logger.info(f"Cleaning up intermediate file: {intermediate_file}")
os.remove(intermediate_file)

Copilot uses AI. Check for mistakes.
except (OSError, ValueError):
# If samefile fails, fall back to string comparison
if intermediate_file != final_file and os.path.exists(intermediate_file):
logger.info(f"Cleaning up intermediate file: {intermediate_file}")
os.remove(intermediate_file)
Comment on lines +510 to +514
Copy link

Copilot AI Nov 4, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[nitpick] The fallback logic duplicates the cleanup code from lines 500-501. Consider extracting the cleanup logic into a helper function or restructuring to avoid duplication.

Copilot uses AI. Check for mistakes.

dt = time.time() - t0
logger.info(f"Completed. Saved GGUF quantized model to {final_file} in {dt:.1f}s")
Expand Down