Skip to content

Commit 3984a99

Browse files
committed
rope, embedding 验证
1 parent c3f8fa9 commit 3984a99

5 files changed

Lines changed: 62 additions & 92 deletions

File tree

.github/ISSUE_TEMPLATE/operator.md

Lines changed: 29 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,29 @@
1+
---
2+
name: 算子新增
3+
about: 用于提交新的算子实现请求
4+
title: '[算子] '
5+
labels: enhancement, operator
6+
assignees: ''
7+
---
8+
9+
## 算子新增
10+
该算子的数学表达为:
11+
12+
## 影响组件
13+
14+
### front
15+
1.
16+
2.
17+
18+
### 引擎
19+
1.
20+
2.
21+
22+
## 其他说明
23+
24+
<!-- 请在此处添加其他相关信息,如:
25+
- 参考实现(如PyTorch中的实现)
26+
- 性能要求
27+
- 测试用例
28+
- 其他注意事项
29+
-->

front/py/deepx/tensor/tensor.py

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -119,7 +119,8 @@ def __radd__(self, other:Union[Number,'Tensor']):
119119
def __sub__(self, other:Union[Number,'Tensor']):
120120
return self.sub(other)
121121
def __rsub__(self, other:Union[Number,'Tensor']):
122-
return self.sub(other)
122+
x=self.mul(-1)
123+
return x.add(other)
123124
def __mul__(self, other:Union[Number,'Tensor']):
124125
return self.mul(other)
125126
def __rmul__(self, other:Union[Number,'Tensor']):
@@ -156,7 +157,7 @@ def __matmul__(self, other:'Tensor'):
156157
return self.matmul(other)
157158
def __rmatmul__(self, other:'Tensor'):
158159
return other.matmul(self)
159-
#gather
160+
160161
def __getitem__(self, index:'Tensor'):
161162
return self.indexselect(index)
162163

front/py/deepx/transformer/modeling_rope_utils.py

Lines changed: 11 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -27,24 +27,27 @@ def _compute_llama3_parameters(config:dict={
2727
# Gets the default RoPE parameters
2828
inv_freq, attention_factor = _compute_default_rope_parameters(config)
2929

30+
factor = config["rope_scaling"]["factor"] # `8` in the original implementation
3031
low_freq_factor = config["rope_scaling"]["low_freq_factor"] # `1` in the original implementation
3132
high_freq_factor = config["rope_scaling"]["high_freq_factor"] # `4` in the original implementation
3233
old_context_len = config["rope_scaling"]["original_max_position_embeddings"] # `8192` in the original implementation
33-
low_freq_wavelen = old_context_len /low_freq_factor
34-
high_freq_wavelen = old_context_len/ high_freq_factor
34+
35+
low_freq_wavelen = old_context_len / low_freq_factor
36+
high_freq_wavelen = old_context_len / high_freq_factor
3537

3638
wavelen = 2 * math.pi / inv_freq
37-
factor=config["rope_scaling"]["factor"]
38-
cases=wavelen > low_freq_wavelen
39-
inv_freq_llama = where(cases, inv_freq /factor, inv_freq)
39+
40+
# wavelen < high_freq_wavelen: do nothing
41+
# wavelen > low_freq_wavelen: divide by factor
42+
inv_freq_llama = where(wavelen > low_freq_wavelen, inv_freq / factor, inv_freq)
4043
# otherwise: interpolate between the two, using a smooth factor
41-
smooth_factor = (old_context_len / wavelen -low_freq_factor) / ( high_freq_factor - low_freq_factor)
42-
smoothed_inv_freq = (1 - smooth_factor) * inv_freq_llama / factor + smooth_factor * inv_freq_llama
44+
smooth_factor = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
45+
smoothed_inv_freq = (1 - smooth_factor) * inv_freq_llama / factor + smooth_factor * inv_freq_llama
4346
is_medium_freq = ~(wavelen < high_freq_wavelen) * ~(wavelen > low_freq_wavelen)
4447
inv_freq_llama = where(is_medium_freq, smoothed_inv_freq, inv_freq_llama)
4548

4649
return inv_freq_llama, attention_factor
47-
50+
4851
ROPE_INIT_FUNCTIONS = {
4952
"default": _compute_default_rope_parameters,
5053
# "linear": _compute_linear_scaling_rope_parameters,
Lines changed: 6 additions & 71 deletions
Original file line number | Diff line number | Diff line change
@@ -1,69 +1,4 @@
1-
hidden_size = 8
2-
eps = 1e-6
3-
dir='/home/lipeng/model/deepxmodel/llama/'
4-
model_path="/home/lipeng/model/deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
5-
print()
6-
7-
from transformers import AutoTokenizer,AutoConfig
8-
def init_tokenizer(model_path):
9-
tokenizer = AutoTokenizer.from_pretrained(model_path)
10-
tokenizer.pad_token = tokenizer.eos_token
11-
return tokenizer
12-
13-
tokenizer = init_tokenizer(model_path)
14-
config=AutoConfig.from_pretrained(model_path)
15-
def tokenize_text(text, tokenizer):
16-
tokens = tokenizer(text, return_tensors="pt").input_ids
17-
import torch
18-
# 处理超出词汇表范围的token
19-
if torch.any(tokens >= tokenizer.vocab_size):
20-
# 获取UNK token ID,如果没有则使用0
21-
unk_token_id = tokenizer.unk_token_id if hasattr(tokenizer, 'unk_token_id') and tokenizer.unk_token_id is not None else 0
22-
# 替换所有超出范围的token为UNK
23-
tokens = torch.where(tokens < tokenizer.vocab_size, tokens, torch.tensor(unk_token_id, device=tokens.device))
24-
return tokens
25-
26-
############-------PyTorch-------################
27-
import torch
28-
29-
# 创建输入
30-
text = "这是一个测试文本,用于演示嵌入层的使用。"
31-
torch_input = tokenize_text(text, tokenizer)
32-
from deepxutil.torch import save_torch
33-
save_torch(torch_input,dir+'input')
34-
35-
# 创建网络
36-
37-
class NetTorch(torch.nn.Module):
38-
from transformers.models.llama.modeling_llama import LlamaConfig
39-
def __init__(self,config:LlamaConfig):
40-
super().__init__()
41-
self.padding_idx = config.pad_token_id
42-
self.config = config
43-
self.embed_tokens = torch.nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
44-
from transformers.models.llama.modeling_llama import LlamaRotaryEmbedding
45-
self.rotary_emb = LlamaRotaryEmbedding(config=config)
46-
47-
def forward(self,x):
48-
inputs_embeds = self.embed_tokens(x)
49-
hidden_states = inputs_embeds
50-
# create position embeddings to be shared across the decoder layers
51-
position_ids = torch.arange(hidden_states.shape[1], device=hidden_states.device).unsqueeze(0)
52-
return self.rotary_emb(hidden_states, position_ids)
53-
54-
55-
torch_net = NetTorch(config)
56-
save_torch(torch_net.embed_tokens.weight,dir+'weight')
57-
# 前向传播
58-
torch_output = torch_net(torch_input)
59-
torch_sin, torch_cos = torch_output
60-
61-
print("sin shape:",torch_sin.shape)
62-
print("sin:", torch_sin)
63-
64-
print("cos shape:", torch_cos.shape)
65-
print("cos:", torch_cos)
66-
1+
from .llama_rope_torch import dir,config
672

683
############-------DEEPX-------################
694
from deepx.nn.modules import Embedding,Module
@@ -86,10 +21,10 @@ def forward(self,x):
8621
position_ids = arange(start=0,end=hidden_states.shape[1]).unsqueeze(0)
8722
return self.rotary_emb(hidden_states, position_ids)
8823

89-
net = NetDeepx(configdict=config.to_dict())
90-
out=net.forward(input)
91-
out[0].print()
92-
out[1].print()
93-
24+
if __name__ == "__main__":
25+
net = NetDeepx(configdict=config.to_dict())
26+
out=net.forward(input)
27+
out[0].print()
28+
out[1].print()
9429

9530

front/py/examples/4_transformer/llama/llama_rope_torch.py

Lines changed: 13 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -52,23 +52,25 @@ def __init__(self, config: LlamaConfig):
5252
self.embed_tokens = torch.nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
5353
from transformers.models.llama.modeling_llama import LlamaRotaryEmbedding
5454
self.rotary_emb = LlamaRotaryEmbedding(config=config)
55-
55+
print("rotary_emb.inv_freq")
56+
print(self.rotary_emb.inv_freq)
5657
def forward(self, x):
5758
inputs_embeds = self.embed_tokens(x)
59+
print(inputs_embeds)
5860
hidden_states = inputs_embeds
5961
# create position embeddings to be shared across the decoder layers
6062
position_ids = torch.arange(hidden_states.shape[1], device=hidden_states.device).unsqueeze(0)
6163
return self.rotary_emb(hidden_states, position_ids)
6264

65+
if __name__ == "__main__":
66+
torch_net = NetTorch(config)
67+
save_torch(torch_net.embed_tokens.weight, dir + 'weight')
68+
# 前向传播
69+
torch_output = torch_net(torch_input)
70+
torch_sin, torch_cos = torch_output
6371

64-
torch_net = NetTorch(config)
65-
save_torch(torch_net.embed_tokens.weight, dir + 'weight')
66-
# 前向传播
67-
torch_output = torch_net(torch_input)
68-
torch_sin, torch_cos = torch_output
69-
70-
print("sin shape:", torch_sin.shape)
71-
print("sin:", torch_sin)
72+
print("sin shape:", torch_sin.shape)
73+
print("sin:", torch_sin)
7274

73-
print("cos shape:", torch_cos.shape)
74-
print("cos:", torch_cos)
75+
print("cos shape:", torch_cos.shape)
76+
print("cos:", torch_cos)

0 commit comments

Comments (0)