# activation-beacon-llama2-7b-chat / modeling_beacon.py
# Uploaded to the Hugging Face Hub (revision cb864b5, verified) via huggingface_hub.
import torch
import numpy as np
import torch.distributed as dist
from transformers.utils import logging
from typing import List, Tuple, Optional
from .modeling_retrieval import BM25Retriever
logger = logging.get_logger(__name__)
class Memory(torch.nn.Module):
    """Streaming KV-cache manager for activation-beacon models.

    Keeps two per-layer caches: raw activations of the current (sliding)
    window and condensed "beacon" activations of everything already
    processed. Windows of `beacon_window` tokens are consumed with stride
    `beacon_stride`, condensed by `beacon_ratio`.
    """

    def __init__(self, model_config, beacon_window:int=1024, beacon_stride:List[int]=[512], beacon_attn:str="step-expansion", beacon_attend_previous:bool=True, beacon_ratio:List[int]=[8], beacon_stride_mix:str="step-random", beacon_ratio_mix:str="step-random", beacon_param:List[str]=["q", "k", "v", "o"], k_seq_dim:int=2, v_seq_dim:int=2, retrieval_method:str=None, retrieval_topk:int=2) -> None:
        """Validate the beacon configuration and initialize all caches.

        Args:
            model_config: HF model config; `vocab_size`, `num_hidden_layers`
                and `max_position_embeddings` are read from it.
            beacon_window: window length L over which beacons are inserted.
            beacon_stride: candidate strides S (each must satisfy S <= L).
            beacon_attn: beacon attention scheme.
            beacon_attend_previous: whether beacons attend to earlier beacons
                (only used for logging here).
            beacon_ratio: candidate condensing ratios (0 means "no condensing").
            beacon_stride_mix / beacon_ratio_mix: how a stride/ratio is picked
                per step or per instance.
            beacon_param: which projections carry beacon parameters (logged only).
            k_seq_dim / v_seq_dim: sequence dimension index in cached key/value tensors.
            retrieval_method: optional "bm25" retrieval over past windows.
            retrieval_topk: number of windows to retrieve (>= 2).
        """
        super().__init__()
        # every configured stride must fit inside the window
        for stride in beacon_stride:
            assert beacon_window >= stride, f"Make sure the beacon_window {beacon_window} >= beacon_stride {stride}!"
        assert beacon_attn in ["segmentation", "step-expansion", "full-coverage"], f"beacon_attn {beacon_attn} not implemented!"
        assert beacon_stride_mix in ["instance-random", "step-random", "mix-random"], f"beacon_stride_mix {beacon_stride_mix} not implemented!"
        assert beacon_ratio_mix in ["instance-random", "step-random", "mix-random", "sequence"] or "adapt-" in beacon_ratio_mix, f"beacon_ratio_mix {beacon_ratio_mix} not implemented!"
        if retrieval_method == "bm25":
            assert len(beacon_stride) == 1, f"Currently retrieval do not support dynamic strides."
            assert retrieval_topk >= 2, f"Make sure retrieval_topk >= 2. Found {retrieval_topk}."
            # FIX: the original message interpolated `self.beacon_ratio` before the
            # attribute was assigned, turning a failed assertion into an AttributeError.
            assert len(beacon_ratio) == 2, f"Make sure there are two beacon ratios specified, one for retrieved windows and the other for non-retrieved windows. Found {beacon_ratio}"

        info = f"applying activation beacon on {beacon_param}, with window size {beacon_window}, stride {beacon_stride} (mixed by {beacon_stride_mix}), {beacon_attn} attention ({'attending to previous beacons' if beacon_attend_previous else 'not attending to previous beacons'}), condensing ratio {beacon_ratio} (mixed by {beacon_ratio_mix}), {retrieval_method+' retrieval'+' top-'+str(retrieval_topk) if retrieval_method is not None else 'no retrieval'}, ..."
        logger.info(info)

        self.beacon_window = beacon_window
        self.beacon_stride = beacon_stride
        self.beacon_attn = beacon_attn
        self.beacon_ratio = beacon_ratio
        self.beacon_stride_mix = beacon_stride_mix
        self.beacon_ratio_mix = beacon_ratio_mix

        # beacon token ids live just past the vocabulary (id == vocab_size);
        # allocate enough for the largest possible beacon count of one window
        max_beacon_size = max([beacon_window // x for x in beacon_ratio if x > 0] + [1])
        self.beacon_tokens = torch.zeros(max_beacon_size, dtype=torch.long) + model_config.vocab_size

        # initialize necessary parameters
        self.k_seq_dim = k_seq_dim
        self.v_seq_dim = v_seq_dim
        self.num_layers = model_config.num_hidden_layers
        self.max_position_embeddings = model_config.max_position_embeddings
        self.retrieval_method = retrieval_method
        self.retrieval_topk = retrieval_topk

        # fixed seed so stride/ratio sampling is reproducible
        self.rng = np.random.default_rng(42)
        self.reset()
@property
def finish(self):
return self.end_idx == self.sequence_length
def get_memory_size(self):
beacon_memory_size = 0
raw_memory_size = 0
if self.beacon_activations[0][0] is not None:
beacon_memory_size += self.beacon_activations[0][0].shape[self.k_seq_dim]
if self.raw_activations[0][0] is not None:
raw_memory_size += self.raw_activations[0][0].shape[self.k_seq_dim]
memory_size = beacon_memory_size + raw_memory_size
return beacon_memory_size, raw_memory_size, memory_size
    def reset(self):
        """Reset all per-sequence bookkeeping, clear both KV caches, and (when
        training under torch.distributed) synchronize the rng state across ranks."""
        # the length of current sequence
        self.sequence_length = 0
        # the length of all sequences until the memory is reset
        self.total_sequence_length = 0
        # the cursor pointing to the start of the current window
        self.start_idx = 0
        # the cursor pointing to the end of the current window
        self.end_idx = 0
        # the beacon sizes of all strides
        self._beacon_sizes = []
        # the step index
        self.step_idx = 0
        if self.beacon_ratio_mix != "step-random":
            # sticky per-instance choices ("step-random" re-draws every step instead)
            self._stride = None
            self._ratio = None
        self.batch_loss = None
        self.valid_token_num = None
        # per-layer (key, value) caches; (None, None) means empty
        self.raw_activations = [(None, None) for _ in range(self.num_layers)]
        self.beacon_activations = [(None, None) for _ in range(self.num_layers)]
        if self.retrieval_method == "bm25":
            self.retriever = BM25Retriever()
            # NOTE(review): this permanently overrides the configured ratio mix
            # with "retrieval" the first time reset() runs — confirm intended
            self.beacon_ratio_mix = "retrieval"
        # NOTE: when training, we strictly align the rng_state across processes
        if self.training and dist.is_initialized():
            rng_state = self.rng.__getstate__()
            if dist.get_rank() == 0:
                obj = [rng_state]
            else:
                obj = [None]
            # rank 0 broadcasts its rng state so every rank draws identical strides/ratios
            dist.broadcast_object_list(obj, src=0)
            self.rng.__setstate__(obj[0])
    def prepare(self, input_ids, attention_mask, labels):
        """
        Prepare inputs for the model.

        Stores the new sequence, rebases the window cursors onto it, rotates
        labels by one position, and (on the first step with bm25 retrieval)
        builds a BM25 index over the windows of the sequence.
        """
        # TODO: support batch_size > 1?
        assert input_ids.shape[0] == 1, f"Make sure batch_size is 1!"
        # NOTE: rebase the start/end idx so that it becomes an offset from the start of the current sequence
        self.start_idx -= self.sequence_length
        self.end_idx -= self.sequence_length
        self.sequence_length = input_ids.shape[1]
        self.total_sequence_length += input_ids.shape[1]
        if labels is not None:
            # rotate labels in advance so that the loss of the last token is not ignored in every window
            labels = torch.cat([labels[:, 1:], labels.new_zeros((labels.shape[0], 1)) - 100], dim=-1)
        # if the current sequence has been completely processed
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.labels = labels
        # TODO: retrieval on specified steps
        # TODO: retrieval for future inputs
        # TODO: different retrieval methods
        if self.retrieval_method == "bm25" and self.step_idx == 0:
            index = BM25Retriever()
            window = self.beacon_window
            stride = self.beacon_stride[0]
            # NOTE: shadows the tensor argument with a plain python list from here on
            input_ids = input_ids[0].tolist()
            # the first chunk is one full window; each later chunk is one stride
            corpus = [input_ids[:window]]
            for j in range(window, len(input_ids), stride):
                corpus.append(input_ids[j: j + stride])
            # NOTE: use the last 32 token as query (very naive heuristics)
            query = input_ids[-32:]
            index.index(corpus)
            topk_scores, topk_indices = index.search(query, hits=self.retrieval_topk)
            # keep only real hits (presumably -1 marks padded/missing results — TODO confirm)
            topk_indices = set([x for x in topk_indices[0] if x > -1])
            self._topk_indices = topk_indices
def set_stride(self):
"""Choose a stride from self.beacon_stride"""
beacon_stride = self.beacon_stride
if len(beacon_stride) == 1:
return beacon_stride[0]
if self.beacon_stride_mix == "mix-random":
stride_mix = self.rng.choice(["instance-random", "step-random"]).tolist()
else:
stride_mix = self.beacon_stride_mix
if stride_mix == "instance-random":
if self._beacon_stride is None:
stride = self.rng.choice(beacon_stride).tolist()
self._stride = stride
else:
stride = self._stride
elif stride_mix == "step-random":
stride = self.rng.choice(beacon_stride).tolist()
else:
raise NotImplementedError
return stride
    def set_condensing_ratio(self, beacon_stride, start_idx, end_idx):
        """Choose a condensing ratio from self.beacon_ratio.

        The candidate ratios are first filtered for the given stride, then one
        is selected according to self.beacon_ratio_mix. `end_idx` is currently
        unused. Sticky policies store their choice in self._ratio.
        """
        def filter_ratio(ratios, stride):
            # keep only the ratios that are usable for this stride
            valid_ratios = []
            for ratio in ratios:
                # stride must be bigger than condensing ratio because there must be at least one beacon
                if stride < ratio:
                    continue
                # step-expansion and segmentation require the stride to be evenly divisible by condensing ratio
                if self.beacon_attn != "full-coverage" and ratio > 0 and (stride % ratio) != 0:
                    continue
                # when training, ratio=0 is valid if previous windows contain beacon or later windows contain beacon
                if ratio == 0 and self.training:
                    previous_beacons = [b for b in self._beacon_sizes if b != -1]
                    following_beacons = (start_idx + stride + self.beacon_window) <= self.sequence_length
                    if len(previous_beacons) == 0 and not following_beacons:
                        continue
                valid_ratios.append(ratio)
            assert len(valid_ratios), f"Cannot find valid condensing ratio (among {ratios}) for stride {stride}!"
            return valid_ratios

        def get_max_length(ratios):
            # theoretical maximum sequence length reachable with each ratio
            max_lengths = []
            for condensing_ratio in ratios:
                if condensing_ratio > 0:
                    max_lengths.append((self.max_position_embeddings - self.beacon_window) * condensing_ratio + self.beacon_window)
                else:
                    max_lengths.append(self.max_position_embeddings)
            return max_lengths

        if len(self.beacon_ratio) == 1:
            return self.beacon_ratio[0]

        beacon_ratio = filter_ratio(self.beacon_ratio, beacon_stride)

        if self.beacon_ratio_mix == "mix-random":
            ratio_mix = self.rng.choice(["instance-random", "step-random"]).tolist()
        else:
            ratio_mix = self.beacon_ratio_mix

        if ratio_mix == "instance-random":
            # draw once per instance, then reuse
            if self._ratio is None:
                beacon_ratio = self.rng.choice(beacon_ratio).tolist()
                self._ratio = beacon_ratio
            else:
                beacon_ratio = self._ratio
        elif ratio_mix == "step-random":
            # fresh draw every window
            beacon_ratio = self.rng.choice(beacon_ratio).tolist()
        elif ratio_mix == "sequence":
            # walk through the configured ratios step by step, clamping at the last
            idx = min(self.step_idx, len(beacon_ratio) - 1)
            beacon_ratio = beacon_ratio[idx]
        elif ratio_mix == "retrieval":
            # for retrieved windows, we use low ratio; otherwise high ratio
            if self.step_idx in self._topk_indices:
                beacon_ratio = min(self.beacon_ratio)
            else:
                beacon_ratio = max(self.beacon_ratio)
        elif "adapt" in ratio_mix:
            # "adapt-<future_length>": pick the smallest ratio whose theoretical
            # max length still covers the expected total sequence length
            if self._ratio is None:
                future_length = int(ratio_mix.split("-")[1])
                sequence_length = self.total_sequence_length + future_length
                max_lengths = get_max_length(beacon_ratio)
                # ascendingly sort the max lengths
                valid_max_lengths_and_indices = [x for x in enumerate(max_lengths) if x[1] >= sequence_length]
                if len(valid_max_lengths_and_indices):
                    minimum_length_index = min(valid_max_lengths_and_indices, key=lambda x: x[1])[0]
                    # use the minimal possible length for this sequence (the smallest fold ratio)
                    beacon_ratio = beacon_ratio[minimum_length_index]
                else:
                    beacon_ratio = max(beacon_ratio)
                    # logger.warning(f"Failed to find valid fold window and size for sequence length {sequence_length}, as the maximum theoretical length is {max(max_lengths)}. Fall back to use the maximum one: {beacon_ratio}.")
                self._ratio = beacon_ratio
            else:
                beacon_ratio = self._ratio
        return beacon_ratio
    def step(self):
        """
        Yield one window with the following logic:

        The window size is L, the stride is S.
        The window moves over S tokens at a time. The raw activations passed by the window are condensed according to a condensing_ratio.
        The beacons are added if and only if the raw activations fulfill the window.
        In the future, we may switch window size to decrease cache size of raw activations.

        Returns:
            (input_ids, attention_mask, past_key_values, labels) for the
            current window; past_key_values is a per-layer list of
            (key, value, beacon_size, raw_size_to_cache, window_size).
        """
        # the starting position of the current window w.r.t. the start of the current input sequence
        start_idx = self.start_idx
        # the end position of the current window w.r.t. the start of the current input sequence
        end_idx = start_idx + self.beacon_window
        # indicates if the current window is completely filled by raw activations and new tokens
        # we only append beacon tokens for full windows
        is_full_window = True
        if end_idx > self.sequence_length:
            # the input is shorter than the initial window size
            end_idx = self.sequence_length
            is_full_window = False
        # the real window size (remaining_size + new_token_size)
        window_size = end_idx - start_idx

        if is_full_window:
            # set stride and condensing ratio
            beacon_stride = self.set_stride()
            condensing_ratio = self.set_condensing_ratio(beacon_stride, start_idx=start_idx, end_idx=end_idx)
            # the stride must be evenly divisible by condensing_ratio
            if condensing_ratio > 0:
                beacon_size = beacon_stride // condensing_ratio
            else:
                # the raw activations are used as beacon activations (beacon_size == -1 flags this downstream)
                beacon_size = -1
            # forward start_idx and end_idx
            next_start_idx = start_idx + beacon_stride
            # how many raw activations to save
            raw_size_to_cache = end_idx - next_start_idx
            self.remaining_size = 0
        else:
            # no stride because the sequence has finished
            next_start_idx = start_idx
            # cache all recent raw activations to be used in the next window
            raw_size_to_cache = window_size
            self.remaining_size = window_size
            beacon_size = 0

        # this is for debugging the resilience to different beacons
        # if self.step_idx == 97:
        #     a = torch.load("beacon_activations")
        #     for i, (beacon_key, beacon_value) in enumerate(self.beacon_activations):
        #         foreign_beacon_key = a[i][0]
        #         foreign_beacon_value = a[i][1]
        #         new_beacon_key = cat_tensor([foreign_beacon_key, slice_tensor(beacon_key, start=16, dim=self.k_seq_dim)], dim=self.k_seq_dim)
        #         new_beacon_value = cat_tensor([foreign_beacon_value, slice_tensor(beacon_value, start=16, dim=self.v_seq_dim)], dim=self.v_seq_dim)
        #         new_beacon_key = foreign_beacon_key
        #         new_beacon_value = foreign_beacon_value
        #         self.beacon_activations[i] = (new_beacon_key, new_beacon_value)

        # streamingly add new input_ids (only tokens not yet consumed)
        input_ids = self.input_ids[:, self.end_idx: end_idx]
        batch_size = input_ids.shape[0]
        if self.attention_mask is not None:
            attention_mask = self.attention_mask[:, self.end_idx: end_idx]
        else:
            attention_mask = torch.ones_like(input_ids)
        if self.labels is not None:
            labels = self.labels[:, self.end_idx: end_idx]
        else:
            labels = None

        # prepend 1 to attention mask for previous memory
        _, _, memory_size = self.get_memory_size()
        if memory_size > 0:
            attention_mask = torch.cat([attention_mask.new_ones(batch_size, memory_size), attention_mask], dim=1)

        # append beacons if necessary
        if is_full_window and beacon_size > 0:
            input_ids = torch.cat([input_ids, self.beacon_tokens[:beacon_size].expand(batch_size, -1).to(input_ids.device, dtype=input_ids.dtype)], dim=1)
            # NOTE: prepend beacon_memory_size 1 to attention_mask because we have past_key_values
            attention_mask = torch.cat([attention_mask, attention_mask.new_ones(batch_size, beacon_size)], dim=1)
            if labels is not None:
                # beacon positions never contribute to the loss
                labels = torch.cat([labels, labels.new_zeros(batch_size, beacon_size) - 100], dim=1)

        # generate memory (memory_length = old_beacon_size + beacon_size * condensing_ratio + raw_cache_size)
        past_key_values = []
        for (beacon_key, beacon_value), (raw_key, raw_value) in zip(self.beacon_activations, self.raw_activations):
            key = cat_tensor([beacon_key, raw_key], dim=self.k_seq_dim)
            value = cat_tensor([beacon_value, raw_value], dim=self.v_seq_dim)
            layer_past_key_values = (key, value, beacon_size, raw_size_to_cache, window_size)
            past_key_values.append(layer_past_key_values)

        # invoked in self.output()
        self._beacon_sizes.append(beacon_size)
        # update end_idx
        self.start_idx = next_start_idx
        self.end_idx = end_idx
        self.step_idx += 1

        # print("****************************************")
        # if is_full_window:
        #     print(f"total_seq_len:        {self.total_sequence_length}")
        #     print(f"stride:               {beacon_stride}")
        #     print(f"condensing ratio:     {condensing_ratio}")
        #     print(f"beacon_size:          {beacon_size}")
        # print(f"input_ids:            {input_ids.shape}")
        # print(f"start_idx:            {start_idx}")
        # print(f"next_start_idx:       {next_start_idx}")
        # print(f"end_idx:              {end_idx}")
        # x = input()
        # if x == "s":
        #     return
        # if self.step_idx == 3:
        #     input()
        return input_ids, attention_mask, past_key_values, labels
    def update_memory(self, past_key_values):
        """
        Accumulate beacon activations and raw activations.

        For each layer, splits the newly returned key/value into a beacon part
        (appended to the beacon cache) and a raw part (kept for the next
        window), dispatching on the beacon_size flag set in step().
        """
        for layer_idx, (key, value, beacon_size, raw_size_to_cache, window_size) in enumerate(past_key_values):
            # NOTE: the past_key_values are incrementally returned (only the new keys and values are returned)
            # key/value: (num_layer, 2, batch_size, num_head, new_seq_len, head_dim)
            # beacon_size: how many beacon activations are in key and value
            # raw_size_to_cache: how many raw activations should be kept
            previous_beacon_key, previous_beacon_value = self.beacon_activations[layer_idx]
            previous_raw_key, previous_raw_value = self.raw_activations[layer_idx]

            if beacon_size == 0:
                # this means the current input does not fulfill a window
                # thus, the key and value are all raw activations, and we accumulate them until the window is fulfilled
                beacon_key = previous_beacon_key
                beacon_value = previous_beacon_value
                assert raw_size_to_cache == window_size
                raw_key = cat_tensor([
                    previous_raw_key,
                    key
                ], dim=self.k_seq_dim)
                raw_value = cat_tensor([
                    previous_raw_value,
                    value
                ], dim=self.v_seq_dim)
            elif beacon_size == -1:
                # this means the raw activations are used as beacon activations for this window
                if raw_size_to_cache > 0:
                    # if we have raw activations, we must first concatenate previous raw activations and current ones, then extract raw_size_to_cache as raw memory, while others as beacon memory
                    concat_key = cat_tensor([
                        previous_raw_key,
                        key
                    ], dim=self.k_seq_dim)
                    concat_value = cat_tensor([
                        previous_raw_value,
                        value
                    ], dim=self.v_seq_dim)
                    # everything except the tail (raw_size_to_cache) becomes beacon memory
                    beacon_key = cat_tensor([
                        previous_beacon_key,
                        slice_tensor(concat_key, end=-raw_size_to_cache, dim=self.k_seq_dim)
                    ], dim=self.k_seq_dim)
                    beacon_value = cat_tensor([
                        previous_beacon_value,
                        slice_tensor(concat_value, end=-raw_size_to_cache, dim=self.v_seq_dim)
                    ], dim=self.v_seq_dim)
                    raw_key = slice_tensor(concat_key, start=-raw_size_to_cache, dim=self.k_seq_dim)
                    raw_value = slice_tensor(concat_value, start=-raw_size_to_cache, dim=self.v_seq_dim)
                else:
                    # if we donot have raw activations, this means stride==window. we put all into beacon memory
                    beacon_key = cat_tensor([
                        previous_beacon_key,
                        key,
                    ], dim=self.k_seq_dim)
                    beacon_value = cat_tensor([
                        previous_beacon_value,
                        value,
                    ], dim=self.v_seq_dim)
                    raw_key = None
                    raw_value = None
            else:
                # [-beacon_size:] activations are from beacons, need to be accumulated
                # [-raw_cache_size-beacon_size:-beacon_size] raw activations will be cached; if they are shorter than raw_cache_size, part of the previous raw activations will also be kept
                beacon_key = cat_tensor([
                    previous_beacon_key,
                    slice_tensor(key, start=-beacon_size, dim=self.k_seq_dim)
                ], dim=self.k_seq_dim)
                beacon_value = cat_tensor([
                    previous_beacon_value,
                    slice_tensor(value, start=-beacon_size, dim=self.v_seq_dim)
                ], dim=self.v_seq_dim)
                if key.shape[self.k_seq_dim] < raw_size_to_cache + beacon_size:
                    # the new raw part is too short: top it up with previous raw activations
                    concat_raw_key = cat_tensor([
                        previous_raw_key,
                        slice_tensor(key, end=-beacon_size, dim=self.k_seq_dim)
                    ], dim=self.k_seq_dim)
                    concat_raw_value = cat_tensor([
                        previous_raw_value,
                        slice_tensor(value, end=-beacon_size, dim=self.v_seq_dim)
                    ], dim=self.v_seq_dim)
                    raw_key = slice_tensor(concat_raw_key, start=-raw_size_to_cache, dim=self.k_seq_dim)
                    raw_value = slice_tensor(concat_raw_value, start=-raw_size_to_cache, dim=self.v_seq_dim)
                else:
                    # becomes None when raw_size_to_cache = 0
                    raw_key = slice_tensor(key, start=-raw_size_to_cache - beacon_size, end=-beacon_size, dim=self.k_seq_dim)
                    raw_value = slice_tensor(value, start=-raw_size_to_cache - beacon_size, end=-beacon_size, dim=self.v_seq_dim)

            self.beacon_activations[layer_idx] = (beacon_key, beacon_value)
            self.raw_activations[layer_idx] = (raw_key, raw_value)

        # NOTE: this is for debugging the resilience to different beacons
        # if self.step_idx == 2:
        #     print(self.get_memory_size())
        #     torch.save(self.beacon_activations, "beacon_activations")
def update_loss(self, batch_loss, valid_token_num):
"""
Accumulate loss for later perplexity computation and backward pass; past_key_values according to cache_method.
"""
if self.batch_loss is None:
# NOTE: multiply valid_token_num because batch_loss is divided by it in advance
self.batch_loss = batch_loss * valid_token_num
self.valid_token_num = valid_token_num
else:
# NOTE: avoid in-place operations, otherwise there will be gradient errors in training
self.batch_loss = self.batch_loss + batch_loss * valid_token_num
self.valid_token_num = self.valid_token_num + valid_token_num
def output(self, model_outputs):
"""
Override loss with accumulated loss.
"""
# override loss
if self.batch_loss is not None:
# here the batch_loss is the summation of all token losses in each element
loss = self.batch_loss.sum() / self.valid_token_num.sum()
# NOTE: prevent nan
batch_loss = self.batch_loss / self.valid_token_num
if (self.valid_token_num == 0).any():
batch_loss = batch_loss.masked_fill(self.valid_token_num == 0, 0.)
# NOTE: we must use dict to override values, otherwise trainer cannot find loss
model_outputs["loss"] = loss
model_outputs["batch_loss"] = batch_loss
model_outputs["valid_token_num"] = self.valid_token_num
# override last_hidden_states (used in generation)
beacon_size = self._beacon_sizes[-1]
# remove logits corresponding to beacon tokens
if beacon_size > 0:
model_outputs["logits"] = model_outputs["logits"][:, :-beacon_size]
# print(f"process {dist.get_rank()}: loss {loss}")
# print(f"process {dist.get_rank()}: beacon_sizes {self._beacon_sizes}")
return model_outputs
def slice_tensor(x, start=None, end=None, dim=2):
if x is None:
return None
if end == 0:
return None
if start == x.shape[dim]:
return None
if start == end:
return None
if dim == 2:
if start is None and end is not None:
return x[:, :, :end, ...]
elif start is not None and end is None:
return x[:, :, start:, ...]
elif start is not None and end is not None:
return x[:, :, start:end, ...]
elif dim == 1:
if start is None and end is not None:
return x[:, :end, ...]
elif start is not None and end is None:
return x[:, start:, ...]
elif start is not None and end is not None:
return x[:, start:end, ...]
else:
raise NotImplementedError
def cat_tensor(list_of_tensors, dim=-1):
    """Concatenate tensors along `dim`, ignoring None entries.

    Returns the single tensor unchanged when only one is present, and None
    when all entries are None (the "empty cache" representation).
    """
    present = [t for t in list_of_tensors if t is not None]
    if not present:
        return None
    if len(present) == 1:
        return present[0]
    return torch.cat(present, dim=dim)
def softmax(x:np.ndarray, axis=-1, temperature=1):
    """Numerically stable, temperature-scaled softmax along `axis`.

    Accepts a plain list as well as an ndarray.
    """
    if isinstance(x, list):
        x = np.array(x)
    scaled = x / temperature
    # subtract the max before exponentiating to avoid overflow
    shifted = scaled - scaled.max(axis=axis, keepdims=True)
    exp = np.exp(shifted)
    return exp / exp.sum(axis=axis, keepdims=True)
def l1_norm(x):
    """Normalize a sequence of numbers so its entries sum to 1 (L1 normalization)."""
    total = sum(x)
    return [value / total for value in x]