2 changes: 2 additions & 0 deletions docs/source/en/_toctree.yml
@@ -634,6 +634,8 @@
title: HunYuanDenseV1
- local: model_doc/hunyuan_v1_moe
title: HunYuanMoEV1
- local: model_doc/hyperclovax
title: HyperCLOVAX
- local: model_doc/ibert
title: I-BERT
- local: model_doc/jais2
96 changes: 96 additions & 0 deletions docs/source/en/model_doc/hyperclovax.md
@@ -0,0 +1,96 @@
<!--Copyright 2026 NAVER Cloud Corp. and The HuggingFace Inc. team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.

-->
*This model was released on 2025-07-21 and added to Hugging Face Transformers on 2026-04-12.*

<div style="float: right;">
<div class="flex flex-wrap space-x-1">
<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
</div>
</div>

# HyperCLOVA X

## Overview

HyperCLOVA X SEED Think is a language model from NAVER Cloud that combines pruning and knowledge distillation with advanced reasoning capabilities. The 14B model uses a Transformer-based architecture with Peri-Layer Normalization and Maximal Update Parametrization (μP), has 14.74B parameters, and supports a 32k context length. It offers dual-mode reasoning (think / non-think) and function calling via a ChatML-based format.

The model was trained with a multi-stage post-training pipeline (SFT → RLVR → Length Controllability → joint RLHF+RLVR) and achieves strong performance on Korean-language benchmarks and reasoning tasks.
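The Peri-Layer Normalization and MuP residual scaling mentioned in the overview can be sketched in a few lines. This is an illustrative sketch only (plain Python, RMSNorm without a learned scale, toy function names), not the actual modeling code:

```python
import math

def rms_norm(x, eps=1e-6):
    """Root-mean-square normalization of a vector (learned scale omitted for brevity)."""
    rms = math.sqrt(sum(v * v for v in x) / len(x) + eps)
    return [v / rms for v in x]

def peri_ln_sublayer(hidden, sublayer, residual_multiplier=1.0):
    """One residual block under Peri-LN: normalize before *and* after the sub-layer,
    then scale the branch by the MuP residual multiplier before the residual add."""
    branch = sublayer(rms_norm(hidden))  # pre-norm, as in a standard pre-LN block
    branch = rms_norm(branch)            # extra post-norm: the "peri" part
    return [h + residual_multiplier * b for h, b in zip(hidden, branch)]

# Toy sub-layer that doubles every activation.
out = peri_ln_sublayer([1.0, 2.0, 3.0], lambda x: [2.0 * v for v in x], residual_multiplier=0.5)
```

The post-norm keeps the branch at unit RMS regardless of what the sub-layer does, so the residual multiplier alone controls how much each block moves the residual stream.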

This model was contributed by [NAVER Cloud HyperCLOVA X Team](https://huggingface.co/naver-hyperclovax). The original model can be found at [naver-hyperclovax/HyperCLOVAX-SEED-Think-14B](https://huggingface.co/naver-hyperclovax/HyperCLOVAX-SEED-Think-14B).

## Usage

The model uses a ChatML-based format with special tokens `<|im_start|>`, `<|im_end|>`, `<|endofturn|>`, and `<|stop|>`. The `apply_chat_template` method accepts the following kwargs:

- `force_reasoning=True` — always think before answering
- `skip_reasoning=True` — always answer directly (non-think mode)
- Default (`None`) — model decides based on context
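For intuition, the token layout these special tokens produce looks roughly like the sketch below. This is illustrative only; the authoritative chat template ships with the checkpoint, and the reasoning-mode kwargs above modify the rendered prompt in ways not shown here:

```python
def render_chatml(messages, add_generation_prompt=True):
    """Rough ChatML-style layout using the special tokens listed above.
    The checkpoint's own chat template is authoritative; this only shows the shape."""
    parts = []
    for message in messages:
        parts.append(f"<|im_start|>{message['role']}\n{message['content']}<|im_end|>\n")
    if add_generation_prompt:
        parts.append("<|im_start|>assistant\n")  # cue the model to answer as the assistant
    return "".join(parts)

prompt = render_chatml([
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is the capital of South Korea?"},
])
```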

<hfoptions id="usage">
<hfoption id="AutoModelForCausalLM">

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "naver-hyperclovax/HyperCLOVAX-SEED-Think-14B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    dtype=torch.bfloat16,  # reviewer note: shouldn't be needed anymore, dtype="auto" is the default nowadays
    device_map="auto",
)

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is the capital of South Korea?"},
]
# Pass force_reasoning=True to always think, or skip_reasoning=True to skip thinking.
model_inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_dict=True,  # needed so the output supports **model_inputs and ["input_ids"] below
    return_tensors="pt",
    # force_reasoning=True,
    # skip_reasoning=True,
).to(model.device)

output = model.generate(
    **model_inputs,
    max_new_tokens=200,
    tokenizer=tokenizer,
    stop_strings=["<|endofturn|>", "<|stop|>"],  # reviewer nit: might be nicer in the generation config instead
)
print(tokenizer.decode(output[0][model_inputs["input_ids"].shape[-1]:], skip_special_tokens=True))
```

</hfoption>
</hfoptions>

## HyperCLOVAXConfig

[[autodoc]] HyperCLOVAXConfig

## HyperCLOVAXModel

[[autodoc]] HyperCLOVAXModel
- forward

## HyperCLOVAXForCausalLM

[[autodoc]] HyperCLOVAXForCausalLM
- forward
1 change: 1 addition & 0 deletions src/transformers/models/__init__.py
@@ -191,6 +191,7 @@
from .hubert import *
from .hunyuan_v1_dense import *
from .hunyuan_v1_moe import *
from .hyperclovax import *
from .ibert import *
from .idefics import *
from .idefics2 import *
2 changes: 2 additions & 0 deletions src/transformers/models/auto/configuration_auto.py
> **Contributor comment:** We changed this on main; you don't need to manually add these here anymore. Just run `python utils/check_auto.py --fix_and_overwrite` for the auto mapping to register these (only for the configs).

@@ -226,6 +226,7 @@
("hubert", "HubertConfig"),
("hunyuan_v1_dense", "HunYuanDenseV1Config"),
("hunyuan_v1_moe", "HunYuanMoEV1Config"),
("hyperclovax", "HyperCLOVAXConfig"),
("ibert", "IBertConfig"),
("idefics", "IdeficsConfig"),
("idefics2", "Idefics2Config"),
@@ -746,6 +747,7 @@
("hubert", "Hubert"),
("hunyuan_v1_dense", "HunYuanDenseV1"),
("hunyuan_v1_moe", "HunYuanMoeV1"),
("hyperclovax", "HyperCLOVAX"),
("ibert", "I-BERT"),
("idefics", "IDEFICS"),
("idefics2", "Idefics2"),
2 changes: 2 additions & 0 deletions src/transformers/models/auto/modeling_auto.py
@@ -223,6 +223,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
("hubert", "HubertModel"),
("hunyuan_v1_dense", "HunYuanDenseV1Model"),
("hunyuan_v1_moe", "HunYuanMoEV1Model"),
("hyperclovax", "HyperCLOVAXModel"),
("ibert", "IBertModel"),
("idefics", "IdeficsModel"),
("idefics2", "Idefics2Model"),
@@ -676,6 +677,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
("helium", "HeliumForCausalLM"),
("hunyuan_v1_dense", "HunYuanDenseV1ForCausalLM"),
("hunyuan_v1_moe", "HunYuanMoEV1ForCausalLM"),
("hyperclovax", "HyperCLOVAXForCausalLM"),
("jais2", "Jais2ForCausalLM"),
("jamba", "JambaForCausalLM"),
("jetmoe", "JetMoeForCausalLM"),
1 change: 1 addition & 0 deletions src/transformers/models/auto/tokenization_auto.py
@@ -154,6 +154,7 @@
("groupvit", "CLIPTokenizer" if is_tokenizers_available() else None),
("herbert", "HerbertTokenizer" if is_tokenizers_available() else None),
("hubert", "Wav2Vec2CTCTokenizer"),
("hyperclovax", "TokenizersBackend" if is_tokenizers_available() else None),
> **Contributor comment** (suggested change: remove this line): should not be needed; we auto-fallback to the tokenizers backend. Could you double-check?

("ibert", "RobertaTokenizer"),
("idefics", "LlamaTokenizer" if is_tokenizers_available() else None),
("idefics2", "LlamaTokenizer" if is_tokenizers_available() else None),
27 changes: 27 additions & 0 deletions src/transformers/models/hyperclovax/__init__.py
@@ -0,0 +1,27 @@
# Copyright 2026 NAVER CLOUD Corp. and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING

from ...utils import _LazyModule
from ...utils.import_utils import define_import_structure


if TYPE_CHECKING:
    from .configuration_hyperclovax import *
    from .modeling_hyperclovax import *
else:
    import sys

    _file = globals()["__file__"]
    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
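The `_LazyModule` indirection above defers the heavy submodule imports until an attribute is first accessed. A simplified stand-alone illustration of the same pattern, using a module-level `__getattr__` (PEP 562); this is not the real `_LazyModule`:

```python
import importlib
import types

def make_lazy_module(name, attr_to_submodule):
    """Build a module object that imports a submodule only when one of its
    attributes is first requested, caching the result for later lookups."""
    mod = types.ModuleType(name)

    def __getattr__(attr):
        if attr not in attr_to_submodule:
            raise AttributeError(f"module {name!r} has no attribute {attr!r}")
        submodule = importlib.import_module(attr_to_submodule[attr])
        value = getattr(submodule, attr)
        setattr(mod, attr, value)  # cache, so __getattr__ is skipped next time
        return value

    mod.__getattr__ = __getattr__
    return mod

# "json" is only imported once .dumps is first touched.
lazy = make_lazy_module("lazy_demo", {"dumps": "json"})
```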
138 changes: 138 additions & 0 deletions src/transformers/models/hyperclovax/configuration_hyperclovax.py
@@ -0,0 +1,138 @@
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# This file was automatically generated from src/transformers/models/hyperclovax/modular_hyperclovax.py.
# Do NOT edit this file manually as any edits will be overwritten by the generation of
# the file from the modular. If any change should be done, please apply the change to the
# modular_hyperclovax.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# Copyright 2026 NAVER CLOUD Corp. and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from huggingface_hub.dataclasses import strict

from ...configuration_utils import PreTrainedConfig
from ...modeling_rope_utils import RopeParameters
from ...utils import auto_docstring


@auto_docstring(checkpoint="naver-hyperclovax/HyperCLOVAX-SEED-Think-14B")
@strict
class HyperCLOVAXConfig(PreTrainedConfig):
    r"""
    embedding_multiplier (`float`, *optional*, defaults to `1.0`):
        Scaling factor applied to the token embedding outputs. Used in MuP to control the
        scale of the embedding activations.
    logits_scaling (`float`, *optional*, defaults to `1.0`):
        Scaling factor **multiplied** to the final logits before loss computation or sampling.
        Used in MuP to ensure consistent output scale across model sizes. Note: unlike
        [`GraniteConfig`], this is a multiplier, not a divisor.
    residual_multiplier (`float`, *optional*, defaults to `1.0`):
        Scaling factor applied to each sub-layer output before adding to the residual stream.
        Used in Maximal Update Parametrization (MuP) to stabilize training across model sizes.
    attention_multiplier (`float`, *optional*, defaults to `head_dim ** -0.5`):
        Scaling factor applied to attention logits before softmax, replacing the standard
        `1 / sqrt(head_dim)` scaling. Set explicitly for MuP-based training; when `None`,
        defaults to the standard value.
    head_dim (`int`, *optional*, defaults to `hidden_size // num_attention_heads`):
        The head dimension for attention. When `None`, defaults to `hidden_size // num_attention_heads`.
    use_post_norm (`bool`, *optional*, defaults to `True`):
        Whether to apply an extra RMSNorm after each sub-layer output (Peri-Layer Normalization).

    ```python
    >>> from transformers import HyperCLOVAXModel, HyperCLOVAXConfig

    >>> # Initializing a HyperCLOVAX style configuration
    >>> configuration = HyperCLOVAXConfig()

    >>> # Initializing a model from the configuration
    >>> model = HyperCLOVAXModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "hyperclovax"
    keys_to_ignore_at_inference = ["past_key_values"]
    # Default tensor parallel plan for base model `HyperCLOVAXModel`
    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
        "layers.*.self_attn.k_proj": "colwise",
        "layers.*.self_attn.v_proj": "colwise",
        "layers.*.self_attn.o_proj": "rowwise",
        "layers.*.mlp.gate_proj": "colwise",
        "layers.*.mlp.up_proj": "colwise",
        "layers.*.mlp.down_proj": "rowwise",
    }
    base_model_pp_plan = {
        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
        "norm": (["hidden_states"], ["hidden_states"]),
    }

    vocab_size: int = 32000
    hidden_size: int = 4096
    intermediate_size: int = 11008
    num_hidden_layers: int = 32
    num_attention_heads: int = 32
    num_key_value_heads: int | None = None
    hidden_act: str = "silu"
    max_position_embeddings: int = 2048
    initializer_range: float = 0.02
    rms_norm_eps: float = 1e-6
    use_cache: bool = True
    pad_token_id: int | None = None
    bos_token_id: int | None = 1
    eos_token_id: int | list[int] | None = 2
    tie_word_embeddings: bool = False
    rope_parameters: RopeParameters | dict | None = None
    attention_bias: bool = False
    attention_dropout: float | int = 0.0
    mlp_bias: bool = False
    embedding_multiplier: float | int = 1.0
    logits_scaling: float | int = 1.0
    residual_multiplier: float | int = 1.0

    # MuP scaling factors: None means "resolve to the mathematically equivalent default".
    attention_multiplier: float | None = None

    head_dim: int | None = None
    # Kept for backward compatibility with older HyperCLOVAX checkpoints; not used by the model.
    pretraining_tp: int | None = 1

    # Peri-Layer Normalization
    use_post_norm: bool = True

    def __post_init__(self, **kwargs):
        if self.head_dim is None:
            self.head_dim = self.hidden_size // self.num_attention_heads
        if self.num_key_value_heads is None:
            self.num_key_value_heads = self.num_attention_heads

        super().__post_init__(**kwargs)

        # Resolve None MuP values to their mathematically equivalent defaults.
        if self.attention_multiplier is None:
            self.attention_multiplier = self.head_dim**-0.5

    def validate_architecture(self):
        """Validates that `hidden_size` is divisible by `num_attention_heads`."""
        if self.hidden_size % self.num_attention_heads != 0:
            raise ValueError(
                f"The hidden size ({self.hidden_size}) is not a multiple of the number of attention "
                f"heads ({self.num_attention_heads})."
            )


__all__ = ["HyperCLOVAXConfig"]
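The `None`-to-default resolution in `__post_init__` can be restated in isolation. A plain-Python sketch of the same logic (not the config class itself; defaults mirror the fields above):

```python
def resolve_defaults(hidden_size=4096, num_attention_heads=32,
                     num_key_value_heads=None, head_dim=None,
                     attention_multiplier=None):
    """None means 'use the mathematically equivalent default', so MuP
    checkpoints may override any factor explicitly."""
    if head_dim is None:
        head_dim = hidden_size // num_attention_heads
    if num_key_value_heads is None:
        num_key_value_heads = num_attention_heads  # no GQA unless set explicitly
    if attention_multiplier is None:
        attention_multiplier = head_dim ** -0.5  # standard 1/sqrt(head_dim) scaling
    return head_dim, num_key_value_heads, attention_multiplier

head_dim, kv_heads, attn_mult = resolve_defaults()
# head_dim = 4096 // 32 = 128, attn_mult = 128 ** -0.5
```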