namespace-Pt committed
Commit 2fee8e7
1 Parent(s): d7bd91c

Upload modeling_llama.py with huggingface_hub

Files changed (1)
  1. modeling_llama.py +180 -71
modeling_llama.py CHANGED
@@ -226,8 +226,35 @@ class LlamaMLP(nn.Module):
         self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
         self.act_fn = ACT2FN[config.hidden_act]
 
-    def forward(self, x):
+        if "mlp" in config.beacon_param:
+            self.beacon_up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+            self.beacon_down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+            self.beacon_up_proj._is_hf_initialized = True
+            self.beacon_down_proj._is_hf_initialized = True
+
+    def _init_beacon_proj(self, beacon_param=None):
+        """Initialize the beacon projection weight with that of the ordinal projection."""
+        if beacon_param is None:
+            beacon_param = self.config.beacon_param
+
+        if is_deepspeed_zero3_enabled():
+            import deepspeed
+            params = [self.up_proj, self.down_proj, self.beacon_up_proj, self.beacon_down_proj]
+            with deepspeed.zero.GatheredParameters(params, modifier_rank=0):
+                if "mlp" in beacon_param:
+                    self.beacon_up_proj.weight.data[:] = self.up_proj.weight.data
+                    self.beacon_down_proj.weight.data[:] = self.down_proj.weight.data
+        else:
+            # only copy the value in-place, without tieing the weight
+            if "mlp" in beacon_param:
+                self.beacon_up_proj.weight.data[:] = self.up_proj.weight.data
+                self.beacon_down_proj.weight.data[:] = self.down_proj.weight.data
+
+    def forward(self, x, beacon_size):
         if self.config.pretraining_tp > 1:
+            # TODO: support pretraining_tp
+            raise NotImplementedError
+
             slice = self.intermediate_size // self.config.pretraining_tp
             gate_proj_slices = self.gate_proj.weight.split(slice, dim=0)
             up_proj_slices = self.up_proj.weight.split(slice, dim=0)
@@ -243,8 +270,28 @@ class LlamaMLP(nn.Module):
                 F.linear(intermediate_states[i], down_proj_slices[i]) for i in range(self.config.pretraining_tp)
             ]
             down_proj = sum(down_proj)
+
         else:
-            down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+            if "mlp" in self.config.beacon_param:
+                if beacon_size > 0:
+                    ordinal_hidden_states = x[:, :-beacon_size]
+                    beacon_hidden_states = x[:, -beacon_size:]
+
+                    # ordinal_up_proj = self.up_proj(ordinal_hidden_states)
+                    # beacon_up_proj = self.beacon_up_proj(beacon_hidden_states)
+                    # up_proj = torch.cat([ordinal_up_proj, beacon_up_proj], dim=1)
+                    # intermediate = self.act_fn(self.gate_proj(x)) * up_proj
+                    # ordinal_down_proj = self.down_proj(intermediate[:, :-beacon_size])
+                    # beacon_down_proj = self.beacon_down_proj(intermediate[:, -beacon_size:])
+                    # down_proj = torch.cat([ordinal_down_proj, beacon_down_proj], dim=1)
+
+                    ordinal_down_proj = self.down_proj(self.act_fn(self.gate_proj(ordinal_hidden_states)) * self.up_proj(ordinal_hidden_states))
+                    beacon_down_proj = self.beacon_down_proj(self.act_fn(self.gate_proj(beacon_hidden_states)) * self.beacon_up_proj(beacon_hidden_states))
+                    down_proj = torch.cat([ordinal_down_proj, beacon_down_proj], dim=1)
+                else:
+                    down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+            else:
+                down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
 
         return down_proj
 
@@ -297,16 +344,20 @@ class LlamaAttention(nn.Module):
         self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias)
         self._init_rope()
 
-        # NOTE: add extra parameters for fold tokens
-        self.beacon_q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
-        self.beacon_k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
-        self.beacon_v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
-        self.beacon_o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias)
+        # NOTE: add extra parameters for beacon tokens
         # skip post initialization to speed up loading
-        self.beacon_q_proj._is_hf_initialized = True
-        self.beacon_k_proj._is_hf_initialized = True
-        self.beacon_v_proj._is_hf_initialized = True
-        self.beacon_o_proj._is_hf_initialized = True
+        if "q" in config.beacon_param:
+            self.beacon_q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
+            self.beacon_q_proj._is_hf_initialized = True
+        if "k" in config.beacon_param:
+            self.beacon_k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
+            self.beacon_k_proj._is_hf_initialized = True
+        if "v" in config.beacon_param:
+            self.beacon_v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
+            self.beacon_v_proj._is_hf_initialized = True
+        if "o" in config.beacon_param:
+            self.beacon_o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias)
+            self.beacon_o_proj._is_hf_initialized = True
 
     def _init_rope(self):
         if self.config.rope_scaling is None:
@@ -335,22 +386,33 @@
         else:
             raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
 
-    def _init_beacon_proj(self):
-        """Initialize the fold projection weight with that of the ordinal projection."""
+    def _init_beacon_proj(self, beacon_param=None):
+        """Initialize the beacon projection weight with that of the ordinal projection."""
+        if beacon_param is None:
+            beacon_param = self.config.beacon_param
+
         if is_deepspeed_zero3_enabled():
            import deepspeed
            params = [self.beacon_q_proj.weight, self.beacon_k_proj.weight, self.beacon_v_proj.weight, self.beacon_o_proj.weight, self.q_proj.weight, self.k_proj.weight, self.v_proj.weight, self.o_proj.weight]
            with deepspeed.zero.GatheredParameters(params, modifier_rank=0):
+                if "q" in beacon_param:
+                    self.beacon_q_proj.weight.data[:] = self.q_proj.weight.data
+                if "k" in beacon_param:
+                    self.beacon_k_proj.weight.data[:] = self.k_proj.weight.data
+                if "v" in beacon_param:
+                    self.beacon_v_proj.weight.data[:] = self.v_proj.weight.data
+                if "o" in beacon_param:
+                    self.beacon_o_proj.weight.data[:] = self.o_proj.weight.data
+        else:
+            # only copy the value in-place, without tieing the weight
+            if "q" in beacon_param:
                self.beacon_q_proj.weight.data[:] = self.q_proj.weight.data
+            if "k" in beacon_param:
                self.beacon_k_proj.weight.data[:] = self.k_proj.weight.data
+            if "v" in beacon_param:
                self.beacon_v_proj.weight.data[:] = self.v_proj.weight.data
+            if "o" in beacon_param:
                self.beacon_o_proj.weight.data[:] = self.o_proj.weight.data
-        else:
-            # only copy the value in-place, without tieing the weight
-            self.beacon_q_proj.weight.data[:] = self.q_proj.weight.data
-            self.beacon_k_proj.weight.data[:] = self.k_proj.weight.data
-            self.beacon_v_proj.weight.data[:] = self.v_proj.weight.data
-            self.beacon_o_proj.weight.data[:] = self.o_proj.weight.data
 
     def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
         return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
@@ -360,17 +422,26 @@
             ordinal_hidden_states = hidden_states[:, :-beacon_size]
             beacon_hidden_states = hidden_states[:, -beacon_size:]
 
-            ordinal_query_states = self.q_proj(ordinal_hidden_states)
-            ordinal_key_states = self.k_proj(ordinal_hidden_states)
-            ordinal_value_states = self.v_proj(ordinal_hidden_states)
-
-            beacon_query_states = self.beacon_q_proj(beacon_hidden_states)
-            beacon_key_states = self.beacon_k_proj(beacon_hidden_states)
-            beacon_value_states = self.beacon_v_proj(beacon_hidden_states)
+            if "q" in self.config.beacon_param:
+                ordinal_query_states = self.q_proj(ordinal_hidden_states)
+                beacon_query_states = self.beacon_q_proj(beacon_hidden_states)
+                query_states = torch.cat([ordinal_query_states, beacon_query_states], dim=1)
+            else:
+                query_states = self.q_proj(hidden_states)
+
+            if "k" in self.config.beacon_param:
+                ordinal_key_states = self.k_proj(ordinal_hidden_states)
+                beacon_key_states = self.beacon_k_proj(beacon_hidden_states)
+                key_states = torch.cat([ordinal_key_states, beacon_key_states], dim=1)
+            else:
+                key_states = self.k_proj(hidden_states)
 
-            query_states = torch.cat([ordinal_query_states, beacon_query_states], dim=1)
-            key_states = torch.cat([ordinal_key_states, beacon_key_states], dim=1)
-            value_states = torch.cat([ordinal_value_states, beacon_value_states], dim=1)
+            if "v" in self.config.beacon_param:
+                ordinal_value_states = self.v_proj(ordinal_hidden_states)
+                beacon_value_states = self.beacon_v_proj(beacon_hidden_states)
+                value_states = torch.cat([ordinal_value_states, beacon_value_states], dim=1)
+            else:
+                value_states = self.v_proj(hidden_states)
 
         else:
             query_states = self.q_proj(hidden_states)
@@ -378,6 +449,18 @@
             value_states = self.v_proj(hidden_states)
 
         return query_states, key_states, value_states
+
+    def o_proj_with_beacon(self, attn_output, beacon_size=0):
+        if beacon_size > 0:
+            if "o" in self.config.beacon_param:
+                ordinal_attn_output = self.o_proj(attn_output[:, :-beacon_size])
+                beacon_attn_output = self.beacon_o_proj(attn_output[:, -beacon_size:])
+                attn_output = torch.cat([ordinal_attn_output, beacon_attn_output], dim=1)
+            else:
+                attn_output = self.o_proj(attn_output)
+        else:
+            attn_output = self.o_proj(attn_output)
+        return attn_output
 
     def forward(
         self,
@@ -403,8 +486,10 @@
         else:
             past_seq_len = 0
 
-        # TODO: support pretraining_tp
         if self.config.pretraining_tp > 1:
+            # TODO: support pretraining_tp
+            raise NotImplementedError
+
             key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp
             query_slices = self.q_proj.weight.split(
                 (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0
@@ -430,13 +515,14 @@
 
         cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
 
+        # return keys and values before rope
+        # NOTE: incrementally return keys and values for efficiency
+        past_key_value = (key_states, value_states, beacon_size, raw_size_to_cache, window_size)
+
         if past_key is not None:
             # reuse k, v, self_attention
             key_states = torch.cat([past_key, key_states], dim=2)
             value_states = torch.cat([past_value, value_states], dim=2)
-
-        # return keys and values before rope
-        past_key_value = (key_states, value_states, beacon_size, raw_size_to_cache, window_size)
 
         key_position_ids = position_ids
         # align query position_ids with key
@@ -480,16 +566,13 @@
 
         if self.config.pretraining_tp > 1:
             # TODO: support pretraining_tp
+            raise NotImplementedError
             attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2)
             o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1)
             attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)])
+
         else:
-            if beacon_size > 0:
-                regular_attn_output = self.o_proj(attn_output[:, :-beacon_size])
-                beacon_attn_output = self.beacon_o_proj(attn_output[:, -beacon_size:])
-                attn_output = torch.cat([regular_attn_output, beacon_attn_output], dim=1)
-            else:
-                attn_output = self.o_proj(attn_output)
+            attn_output = self.o_proj_with_beacon(attn_output, beacon_size)
 
         if not output_attentions:
             attn_weights = None
@@ -545,14 +628,15 @@ class LlamaSdpaAttention(LlamaAttention):
 
         cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
 
+        # return keys and values before rope
+        # NOTE: incrementally return keys and values for efficiency
+        past_key_value = (key_states, value_states, beacon_size, raw_size_to_cache, window_size)
+
         if past_key is not None:
             # reuse k, v, self_attention
             key_states = torch.cat([past_key, key_states], dim=2)
             value_states = torch.cat([past_value, value_states], dim=2)
 
-        # return keys and values before rope
-        past_key_value = (key_states, value_states, beacon_size, raw_size_to_cache, window_size)
-
         key_position_ids = position_ids
         # align query position_ids with key
         query_position_ids = key_position_ids[:, -q_len:]
@@ -588,13 +672,20 @@
 
         attn_output = attn_output.transpose(1, 2).contiguous()
         attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
-
-        if beacon_size > 0:
-            regular_attn_output = self.o_proj(attn_output[:, :-beacon_size])
-            beacon_attn_output = self.beacon_o_proj(attn_output[:, -beacon_size:])
-            attn_output = torch.cat([regular_attn_output, beacon_attn_output], dim=1)
-        else:
-            attn_output = self.o_proj(attn_output)
+        attn_output = self.o_proj_with_beacon(attn_output, beacon_size)
+
+        # for debug
+        # if torch.distributed.get_rank() == 4 and self.layer_idx == 0:
+        #     torch.save({
+        #         "hidden_states": hidden_states,
+        #         "past_key_value": past_key_value,
+        #         "query_states": query_states,
+        #         "key_states": key_states,
+        #         "value_states": value_states,
+        #         "attn_output": attn_output,
+        #         "attention_mask": attention_mask,
+        #         "key_position_ids": key_position_ids,
+        #     }, "beacon_llama_layer_0")
 
         return attn_output, None, past_key_value
 
@@ -645,6 +736,9 @@ class LlamaDecoderLayer(nn.Module):
                 "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
             )
 
+        # NOTE: get beacon_size in case the mlp is included in beacon_param
+        past_key, past_value, beacon_size, raw_size_to_cache, window_size = past_key_value
+
         residual = hidden_states
 
         hidden_states = self.input_layernorm(hidden_states)
@@ -664,7 +758,7 @@
         # Fully Connected
         residual = hidden_states
         hidden_states = self.post_attention_layernorm(hidden_states)
-        hidden_states = self.mlp(hidden_states)
+        hidden_states = self.mlp(hidden_states, beacon_size)
        hidden_states = residual + hidden_states
 
         outputs = (hidden_states,)
@@ -843,10 +937,6 @@ def compute_loss(logits, labels, shift=False):
     if (valid_token_num == 0).any():
         batch_loss = batch_loss.masked_fill(valid_token_num == 0, 0.)
 
-    # print("beacon")
-    # print(f"token_loss: {token_loss[:, :100].tolist()}")
-    # print(f"batch_loss: {batch_loss}")
-    # input()
     return loss, batch_loss, valid_token_num
 
 @dataclass
@@ -895,7 +985,7 @@ class LlamaModel(LlamaPreTrainedModel):
         self.post_init()
 
     def _init_beacon_embed(self):
-        """Initialize the fold token embedding with that of the eos token."""
+        """Initialize the beacon token embedding with that of the eos token."""
         if is_deepspeed_zero3_enabled():
             import deepspeed
             params = [self.beacon_embed_tokens.weight, self.embed_tokens.weight]
@@ -1109,6 +1199,15 @@ class LlamaModel(LlamaPreTrainedModel):
 
         hidden_states = self.norm(hidden_states)
 
+        # for debug
+        # if torch.distributed.get_rank() == 4:
+        #     torch.save({
+        #         "hidden_states": hidden_states,
+        #         "past_key_values": past_key_values,
+        #         "attention_mask": attention_mask,
+        #         "position_ids": position_ids,
+        #     }, "beacon_llama_inputs")
+
         # add hidden states from the last decoder layer
         if output_hidden_states:
             all_hidden_states += (hidden_states,)
@@ -1139,9 +1238,6 @@ class LlamaForCausalLM(LlamaPreTrainedModel):
 
     def set_memory(self):
         config: LlamaConfig = self.config
-        info = f"applying activation beacon on {'all' if config.beacon_layers is None else config.beacon_layers} layers, with window size {config.beacon_window}, stride {config.beacon_stride} (mixed by {config.beacon_stride_mix}), {config.beacon_attn} attention, and condensing ratio {config.beacon_ratio} (mixed by {config.beacon_ratio_mix}), seed {config.beacon_seed}..."
-        logger.info(info)
-
         self.memory = Memory(
             model_config=config,
             beacon_window=config.beacon_window,
@@ -1151,10 +1247,11 @@ class LlamaForCausalLM(LlamaPreTrainedModel):
             beacon_ratio=config.beacon_ratio,
             beacon_stride_mix=config.beacon_stride_mix,
             beacon_ratio_mix=config.beacon_ratio_mix,
-            beacon_seed=config.beacon_seed,
-            beacon_layers=config.beacon_layers,
+            beacon_param=config.beacon_param,
             k_seq_dim=2,
             v_seq_dim=2,
+            retrieval_method=config.retrieval_method,
+            retrieval_topk=config.retrieval_topk,
         )
 
     def get_input_embeddings(self):
@@ -1180,15 +1277,26 @@ class LlamaForCausalLM(LlamaPreTrainedModel):
         """Override the default from_pretrained to extend vocab size according to beacon_size."""
         model, loading_info = super().from_pretrained(*args, **kwargs, output_loading_info=True)
         missing_keys = loading_info["missing_keys"]
-        # only initialize weights when they are missing from the checkpoint
-        if any("beacon_embed_tokens" in missing_key for missing_key in missing_keys):
-            # initialize weights of embedding layers for fold tokens
-            model.model._init_beacon_embed()
-        if any("beacon_q_proj" in missing_key for missing_key in missing_keys):
-            # initialize weights of linear layers for fold tokens
-            for layer in model.model.layers:
-                if hasattr(layer.self_attn, "_init_beacon_proj"):
-                    layer.self_attn._init_beacon_proj()
+        # only initialize beacon weights when they are missing from the checkpoint
+        beacon_param = set()
+        for missing_key in missing_keys:
+            if "beacon_embed_tokens" in missing_key:
+                model.model._init_beacon_embed()
+            elif "beacon_q_proj" in missing_key:
+                beacon_param.add("q")
+            elif "beacon_k_proj" in missing_key:
+                beacon_param.add("k")
+            elif "beacon_v_proj" in missing_key:
+                beacon_param.add("v")
+            elif "beacon_o_proj" in missing_key:
+                beacon_param.add("o")
+            elif "beacon_up_proj" in missing_key:
+                beacon_param.add("mlp")
+
+        # initialize weights of possible q,k,v,o,mlp
+        for layer in model.model.layers:
+            layer.self_attn._init_beacon_proj(beacon_param)
+            layer.mlp._init_beacon_proj(beacon_param)
         return model
 
     def _native_forward(
@@ -1397,7 +1505,7 @@ def evaluate_perplexity(model, dataloader, accelerator:Optional[Accelerator]=Non
 
         # NOTE: we need the loss for each element in the batch for accurate computation, because the number of valid tokens may differ among elements
         if hasattr(output, "batch_loss"):
-            # output from Fold-Llama has batch_loss by default
+            # output from our model has batch_loss by default
             batch_loss = output.batch_loss
             valid_token_num = output.valid_token_num
         else:
@@ -1415,9 +1523,10 @@ def evaluate_perplexity(model, dataloader, accelerator:Optional[Accelerator]=Non
            all_loss[_id].append((_loss * _num, _num))
 
    for _id, loss_and_num in all_loss.items():
-        # sum up the loss for all valid tokens, and divide the number of valid tokens
+        # sum up the loss for all valid tokens in the entire sequence, and divide the number of valid tokens
        all_loss[_id] = sum([x[0] for x in loss_and_num]) / sum(x[1] for x in loss_and_num)
 
+    # average across then take exp
    perplexity = math.exp(sum(all_loss.values()) / len(all_loss))
    return perplexity
 
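For reference, the core pattern this commit introduces in qkv_proj_with_beacon, o_proj_with_beacon and LlamaMLP.forward is to route the trailing beacon_size positions through separate projection weights and concatenate the results back along the sequence dimension. Below is a minimal, self-contained sketch of that pattern; the class and variable names are illustrative only and are not part of modeling_llama.py.

import torch
import torch.nn as nn

class BeaconSplitLinear(nn.Module):
    """Toy module: ordinary tokens use ordinal_proj, the last `beacon_size`
    tokens use beacon_proj, and the two outputs are concatenated on dim=1."""

    def __init__(self, in_features, out_features):
        super().__init__()
        self.ordinal_proj = nn.Linear(in_features, out_features, bias=False)
        self.beacon_proj = nn.Linear(in_features, out_features, bias=False)
        # start the beacon projection as an exact copy of the ordinal one,
        # copying values in place rather than tying the parameters
        self.beacon_proj.weight.data[:] = self.ordinal_proj.weight.data

    def forward(self, hidden_states, beacon_size=0):
        if beacon_size > 0:
            ordinal = self.ordinal_proj(hidden_states[:, :-beacon_size])
            beacon = self.beacon_proj(hidden_states[:, -beacon_size:])
            return torch.cat([ordinal, beacon], dim=1)
        return self.ordinal_proj(hidden_states)

x = torch.randn(2, 10, 16)            # (batch, seq_len, hidden)
layer = BeaconSplitLinear(16, 16)
print(layer(x, beacon_size=2).shape)  # torch.Size([2, 10, 16])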
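The comment "only copy the value in-place, without tieing the weight" in _init_beacon_proj refers to the difference between copying parameter values and sharing the parameter object. A small sketch of the distinction, assuming plain nn.Linear layers:

import torch.nn as nn

src = nn.Linear(8, 8, bias=False)
dst = nn.Linear(8, 8, bias=False)

# in-place copy (the approach used here): dst starts equal to src,
# but the two layers keep separate parameters and can diverge in training
dst.weight.data[:] = src.weight.data
assert dst.weight is not src.weight

# weight tying (deliberately avoided): both layers share one parameter
# tensor, so any update to one is an update to the other
tied = nn.Linear(8, 8, bias=False)
tied.weight = src.weight
assert tied.weight is src.weight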
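The past_key_value entry each attention layer now returns is a five-element tuple built from the current chunk only, before concatenation with the cached keys and values ("incrementally return keys and values for efficiency"), and LlamaDecoderLayer unpacks the same tuple to recover beacon_size for the MLP. A sketch of that layout; the shapes are illustrative (batch 1, 32 KV heads, a 128-token chunk, head_dim 128) and not taken from this commit.

import torch

key_states = torch.randn(1, 32, 128, 128)    # (batch, num_kv_heads, chunk_len, head_dim)
value_states = torch.randn(1, 32, 128, 128)
beacon_size, raw_size_to_cache, window_size = 16, 128, 1024

# what an attention layer returns for its layer's cache entry
past_key_value = (key_states, value_states, beacon_size, raw_size_to_cache, window_size)

# what LlamaDecoderLayer.forward unpacks before calling self.mlp(hidden_states, beacon_size)
past_key, past_value, beacon_size, raw_size_to_cache, window_size = past_key_value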