import torch
import numpy as np
import torch.distributed as dist
from transformers.utils import logging
from typing import List, Tuple, Optional
from .modeling_retrieval import BM25Retriever

logger = logging.get_logger(__name__)


class Memory(torch.nn.Module):
    def __init__(
        self,
        model_config,
        beacon_window: int = 1024,
        beacon_stride: List[int] = [512],
        beacon_attn: str = "step-expansion",
        beacon_attend_previous: bool = True,
        beacon_ratio: List[int] = [8],
        beacon_stride_mix: str = "step-random",
        beacon_ratio_mix: str = "step-random",
        beacon_param: List[str] = ["q", "k", "v", "o"],
        k_seq_dim: int = 2,
        v_seq_dim: int = 2,
        retrieval_method: Optional[str] = None,
        retrieval_topk: int = 2,
    ) -> None:
        super().__init__()

        for stride in beacon_stride:
            assert beacon_window >= stride, f"Make sure the beacon_window {beacon_window} >= beacon_stride {stride}!"
        assert beacon_attn in ["segmentation", "step-expansion", "full-coverage"], f"beacon_attn {beacon_attn} not implemented!"
        assert beacon_stride_mix in ["instance-random", "step-random", "mix-random"], f"beacon_stride_mix {beacon_stride_mix} not implemented!"
        assert beacon_ratio_mix in ["instance-random", "step-random", "mix-random", "sequence"] or "adapt-" in beacon_ratio_mix, f"beacon_ratio_mix {beacon_ratio_mix} not implemented!"

        if retrieval_method == "bm25":
            assert len(beacon_stride) == 1, "Retrieval does not support dynamic strides yet."
            assert retrieval_topk >= 2, f"Make sure retrieval_topk >= 2. Found {retrieval_topk}."
            assert len(beacon_ratio) == 2, f"Make sure there are exactly two beacon ratios, one for retrieved windows and the other for non-retrieved windows. Found {beacon_ratio}."

        info = f"applying activation beacon on {beacon_param}, with window size {beacon_window}, stride {beacon_stride} (mixed by {beacon_stride_mix}), {beacon_attn} attention ({'attending to previous beacons' if beacon_attend_previous else 'not attending to previous beacons'}), condensing ratio {beacon_ratio} (mixed by {beacon_ratio_mix}), {retrieval_method + ' retrieval top-' + str(retrieval_topk) if retrieval_method is not None else 'no retrieval'}, ..."
        logger.info(info)

        self.beacon_window = beacon_window
        self.beacon_stride = beacon_stride
        self.beacon_attn = beacon_attn
        self.beacon_ratio = beacon_ratio
        self.beacon_stride_mix = beacon_stride_mix
        self.beacon_ratio_mix = beacon_ratio_mix

        max_beacon_size = max([beacon_window // x for x in beacon_ratio if x > 0] + [1])
        # the beacon token id is vocab_size, i.e. one id past the original vocabulary
        self.beacon_tokens = torch.zeros(max_beacon_size, dtype=torch.long) + model_config.vocab_size

        # initialize necessary parameters
        self.k_seq_dim = k_seq_dim
        self.v_seq_dim = v_seq_dim
        self.num_layers = model_config.num_hidden_layers
        self.max_position_embeddings = model_config.max_position_embeddings
        self.retrieval_method = retrieval_method
        self.retrieval_topk = retrieval_topk

        self.rng = np.random.default_rng(42)
        self.reset()
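    # Illustrative walkthrough of the defaults above (these numbers are an example,
    # not additional configuration): with beacon_window=1024, beacon_stride=[512] and
    # beacon_ratio=[8], every full window condenses one stride of 512 raw activations
    # into 512 // 8 = 64 beacon activations, and max_beacon_size = 1024 // 8 = 128,
    # so self.beacon_tokens holds 128 copies of the beacon token id (vocab_size).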
    @property
    def finish(self):
        return self.end_idx == self.sequence_length

    def get_memory_size(self):
        beacon_memory_size = 0
        raw_memory_size = 0
        if self.beacon_activations[0][0] is not None:
            beacon_memory_size += self.beacon_activations[0][0].shape[self.k_seq_dim]
        if self.raw_activations[0][0] is not None:
            raw_memory_size += self.raw_activations[0][0].shape[self.k_seq_dim]
        memory_size = beacon_memory_size + raw_memory_size
        return beacon_memory_size, raw_memory_size, memory_size

    def reset(self):
        # the length of the current sequence
        self.sequence_length = 0
        # the length of all sequences processed since the memory was last reset
        self.total_sequence_length = 0
        # the cursor pointing to the start of the current window
        self.start_idx = 0
        # the cursor pointing to the end of the current window
        self.end_idx = 0
        # the beacon sizes of all strides
        self._beacon_sizes = []
        # the step index
        self.step_idx = 0

        if self.beacon_ratio_mix != "step-random":
            self._stride = None
            self._ratio = None

        self.batch_loss = None
        self.valid_token_num = None

        self.raw_activations = [(None, None) for _ in range(self.num_layers)]
        self.beacon_activations = [(None, None) for _ in range(self.num_layers)]

        if self.retrieval_method == "bm25":
            self.retriever = BM25Retriever()
            self.beacon_ratio_mix = "retrieval"

        # NOTE: when training, we strictly align the rng state across processes
        if self.training and dist.is_initialized():
            rng_state = self.rng.__getstate__()
            if dist.get_rank() == 0:
                obj = [rng_state]
            else:
                obj = [None]
            dist.broadcast_object_list(obj, src=0)
            self.rng.__setstate__(obj[0])
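    # Sketch of the expected call sequence (illustrative; the actual driving loop and
    # the `model` below live in the modeling code that owns this Memory instance):
    #
    #   memory.reset()
    #   memory.prepare(input_ids, attention_mask, labels)
    #   while not memory.finish:
    #       inputs = memory.step()
    #       outputs = model(*inputs)                        # hypothetical forward pass
    #       memory.update_memory(outputs.past_key_values)
    #       memory.update_loss(outputs.batch_loss, outputs.valid_token_num)
    #   outputs = memory.output(outputs)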
    def prepare(self, input_ids, attention_mask, labels):
        """
        Prepare inputs for the model.
        """
        # TODO: support batch_size > 1?
        assert input_ids.shape[0] == 1, "Make sure batch_size is 1!"

        # NOTE: rebase start_idx/end_idx so that they become offsets from the start of the current sequence
        self.start_idx -= self.sequence_length
        self.end_idx -= self.sequence_length
        self.sequence_length = input_ids.shape[1]
        self.total_sequence_length += input_ids.shape[1]

        if labels is not None:
            # rotate labels in advance so that the loss of the last token is not ignored in every window
            labels = torch.cat([labels[:, 1:], labels.new_zeros((labels.shape[0], 1)) - 100], dim=-1)

        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.labels = labels

        # TODO: retrieval on specified steps
        # TODO: retrieval for future inputs
        # TODO: different retrieval methods
        if self.retrieval_method == "bm25" and self.step_idx == 0:
            index = BM25Retriever()
            window = self.beacon_window
            stride = self.beacon_stride[0]
            input_ids = input_ids[0].tolist()
            corpus = [input_ids[:window]]
            for j in range(window, len(input_ids), stride):
                corpus.append(input_ids[j: j + stride])
            # NOTE: use the last 32 tokens as the query (a very naive heuristic)
            query = input_ids[-32:]
            index.index(corpus)
            topk_scores, topk_indices = index.search(query, hits=self.retrieval_topk)
            topk_indices = set(x for x in topk_indices[0] if x > -1)
            self._topk_indices = topk_indices

    def set_stride(self):
        """Choose a stride from self.beacon_stride."""
        beacon_stride = self.beacon_stride
        if len(beacon_stride) == 1:
            return beacon_stride[0]

        if self.beacon_stride_mix == "mix-random":
            stride_mix = self.rng.choice(["instance-random", "step-random"]).tolist()
        else:
            stride_mix = self.beacon_stride_mix

        if stride_mix == "instance-random":
            if self._stride is None:
                stride = self.rng.choice(beacon_stride).tolist()
                self._stride = stride
            else:
                stride = self._stride
        elif stride_mix == "step-random":
            stride = self.rng.choice(beacon_stride).tolist()
        else:
            raise NotImplementedError
        return stride
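    # Worked example for the BM25 chunking in prepare() (illustrative numbers): with
    # beacon_window=1024, beacon_stride=[512] and a 2048-token sequence, the corpus is
    # [tokens[0:1024], tokens[1024:1536], tokens[1536:2048]], i.e. one chunk per step's
    # newly consumed tokens, so the returned chunk indices line up with self.step_idx
    # and can be matched against it under the "retrieval" ratio mixing below.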
    def set_condensing_ratio(self, beacon_stride, start_idx, end_idx):
        """Choose a condensing ratio from self.beacon_ratio."""
        def filter_ratio(ratios, stride):
            valid_ratios = []
            for ratio in ratios:
                # the stride must be no smaller than the condensing ratio because there must be at least one beacon
                if stride < ratio:
                    continue
                # step-expansion and segmentation require the stride to be evenly divisible by the condensing ratio
                if self.beacon_attn != "full-coverage" and ratio > 0 and (stride % ratio) != 0:
                    continue
                # when training, ratio=0 is valid only if previous windows or later windows contain beacons
                if ratio == 0 and self.training:
                    previous_beacons = [b for b in self._beacon_sizes if b != -1]
                    following_beacons = (start_idx + stride + self.beacon_window) <= self.sequence_length
                    if len(previous_beacons) == 0 and not following_beacons:
                        continue
                valid_ratios.append(ratio)
            assert len(valid_ratios), f"Cannot find a valid condensing ratio (among {ratios}) for stride {stride}!"
            return valid_ratios

        def get_max_length(ratios):
            max_lengths = []
            for condensing_ratio in ratios:
                if condensing_ratio > 0:
                    max_lengths.append((self.max_position_embeddings - self.beacon_window) * condensing_ratio + self.beacon_window)
                else:
                    max_lengths.append(self.max_position_embeddings)
            return max_lengths

        if len(self.beacon_ratio) == 1:
            return self.beacon_ratio[0]

        beacon_ratio = filter_ratio(self.beacon_ratio, beacon_stride)

        if self.beacon_ratio_mix == "mix-random":
            ratio_mix = self.rng.choice(["instance-random", "step-random"]).tolist()
        else:
            ratio_mix = self.beacon_ratio_mix

        if ratio_mix == "instance-random":
            if self._ratio is None:
                beacon_ratio = self.rng.choice(beacon_ratio).tolist()
                self._ratio = beacon_ratio
            else:
                beacon_ratio = self._ratio
        elif ratio_mix == "step-random":
            beacon_ratio = self.rng.choice(beacon_ratio).tolist()
        elif ratio_mix == "sequence":
            idx = min(self.step_idx, len(beacon_ratio) - 1)
            beacon_ratio = beacon_ratio[idx]
        elif ratio_mix == "retrieval":
            # use a low ratio for retrieved windows and a high ratio for the others
            if self.step_idx in self._topk_indices:
                beacon_ratio = min(self.beacon_ratio)
            else:
                beacon_ratio = max(self.beacon_ratio)
        elif "adapt" in ratio_mix:
            if self._ratio is None:
                future_length = int(ratio_mix.split("-")[1])
                sequence_length = self.total_sequence_length + future_length
                max_lengths = get_max_length(beacon_ratio)
                # find the ratios whose maximum supported length covers the sequence
                valid_max_lengths_and_indices = [x for x in enumerate(max_lengths) if x[1] >= sequence_length]
                if len(valid_max_lengths_and_indices):
                    minimum_length_index = min(valid_max_lengths_and_indices, key=lambda x: x[1])[0]
                    # use the minimal possible length for this sequence (the smallest fold ratio)
                    beacon_ratio = beacon_ratio[minimum_length_index]
                else:
                    beacon_ratio = max(beacon_ratio)
                    # logger.warning(f"Failed to find a valid fold window and size for sequence length {sequence_length}, as the maximum theoretical length is {max(max_lengths)}. Falling back to the maximum ratio: {beacon_ratio}.")
                self._ratio = beacon_ratio
            else:
                beacon_ratio = self._ratio
        return beacon_ratio
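    # Worked example for the "adapt-" branch above (illustrative numbers): with
    # max_position_embeddings=4096 and beacon_window=1024, a ratio of 4 supports up to
    # (4096 - 1024) * 4 + 1024 = 13312 tokens while a ratio of 8 supports up to 25600.
    # Under "adapt-1024" with total_sequence_length=20000, the target length is
    # 20000 + 1024 = 21024, so ratio 8 is chosen as the smallest ratio that covers it.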
    def step(self):
        """
        Yield one window with the following logic:

        The window size is L and the stride is S. The window moves over S tokens at a time.
        The raw activations passed over by the window are condensed according to a condensing
        ratio. Beacons are appended if and only if raw activations and new tokens completely
        fill the window. In the future, we may switch window sizes to decrease the cache size
        of raw activations.
        """
        # the start position of the current window w.r.t. the start of the current input sequence
        start_idx = self.start_idx
        # the end position of the current window w.r.t. the start of the current input sequence
        end_idx = start_idx + self.beacon_window

        # indicates whether the current window is completely filled by raw activations and new tokens
        # we only append beacon tokens for full windows
        is_full_window = True
        if end_idx > self.sequence_length:
            # the input is shorter than the initial window size
            end_idx = self.sequence_length
            is_full_window = False

        # the real window size (remaining_size + new_token_size)
        window_size = end_idx - start_idx

        if is_full_window:
            # set stride and condensing ratio
            beacon_stride = self.set_stride()
            condensing_ratio = self.set_condensing_ratio(beacon_stride, start_idx=start_idx, end_idx=end_idx)

            # the stride must be evenly divisible by condensing_ratio
            if condensing_ratio > 0:
                beacon_size = beacon_stride // condensing_ratio
            else:
                # the raw activations are used as beacon activations
                beacon_size = -1

            # move start_idx and end_idx forward
            next_start_idx = start_idx + beacon_stride
            # how many raw activations to save
            raw_size_to_cache = end_idx - next_start_idx
            self.remaining_size = 0
        else:
            # no stride because the sequence has finished before filling the window
            next_start_idx = start_idx
            # cache all recent raw activations to be used in the next window
            raw_size_to_cache = window_size
            self.remaining_size = window_size
            beacon_size = 0

        # NOTE: this is for debugging the resilience to different beacons
        # if self.step_idx == 97:
        #     a = torch.load("beacon_activations")
        #     for i, (beacon_key, beacon_value) in enumerate(self.beacon_activations):
        #         foreign_beacon_key = a[i][0]
        #         foreign_beacon_value = a[i][1]
        #         new_beacon_key = cat_tensor([foreign_beacon_key, slice_tensor(beacon_key, start=16, dim=self.k_seq_dim)], dim=self.k_seq_dim)
        #         new_beacon_value = cat_tensor([foreign_beacon_value, slice_tensor(beacon_value, start=16, dim=self.v_seq_dim)], dim=self.v_seq_dim)
        #         new_beacon_key = foreign_beacon_key
        #         new_beacon_value = foreign_beacon_value
        #         self.beacon_activations[i] = (new_beacon_key, new_beacon_value)

        # streamingly add new input_ids
        input_ids = self.input_ids[:, self.end_idx: end_idx]
        batch_size = input_ids.shape[0]
        if self.attention_mask is not None:
            attention_mask = self.attention_mask[:, self.end_idx: end_idx]
        else:
            attention_mask = torch.ones_like(input_ids)
        if self.labels is not None:
            labels = self.labels[:, self.end_idx: end_idx]
        else:
            labels = None

        # prepend 1s to the attention mask to cover the previous memory
        _, _, memory_size = self.get_memory_size()
        if memory_size > 0:
            attention_mask = torch.cat([attention_mask.new_ones(batch_size, memory_size), attention_mask], dim=1)

        # append beacons if necessary
        if is_full_window and beacon_size > 0:
            input_ids = torch.cat([input_ids, self.beacon_tokens[:beacon_size].expand(batch_size, -1).to(input_ids.device, dtype=input_ids.dtype)], dim=1)
            # NOTE: also append 1s to the attention mask for the beacon tokens
            attention_mask = torch.cat([attention_mask, attention_mask.new_ones(batch_size, beacon_size)], dim=1)
            if labels is not None:
                labels = torch.cat([labels, labels.new_zeros(batch_size, beacon_size) - 100], dim=1)

        # generate memory (memory_length = old_beacon_size + beacon_size * condensing_ratio + raw_cache_size)
        past_key_values = []
        for (beacon_key, beacon_value), (raw_key, raw_value) in zip(self.beacon_activations, self.raw_activations):
            key = cat_tensor([beacon_key, raw_key], dim=self.k_seq_dim)
            value = cat_tensor([beacon_value, raw_value], dim=self.v_seq_dim)
            layer_past_key_values = (key, value, beacon_size, raw_size_to_cache, window_size)
            past_key_values.append(layer_past_key_values)

        # used later in self.output()
        self._beacon_sizes.append(beacon_size)

        # move the cursors forward
        self.start_idx = next_start_idx
        self.end_idx = end_idx
        self.step_idx += 1

        # print("****************************************")
        # if is_full_window:
        #     print(f"total_seq_len:    {self.total_sequence_length}")
        #     print(f"stride:           {beacon_stride}")
        #     print(f"condensing ratio: {condensing_ratio}")
        #     print(f"beacon_size:      {beacon_size}")
        # print(f"input_ids:      {input_ids.shape}")
        # print(f"start_idx:      {start_idx}")
        # print(f"next_start_idx: {next_start_idx}")
        # print(f"end_idx:        {end_idx}")
        # x = input()
        # if x == "s":
        #     return
        # if self.step_idx == 3:
        #     input()

        return input_ids, attention_mask, past_key_values, labels
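    # Step-by-step walkthrough of step() (illustrative numbers): with beacon_window=1024,
    # beacon_stride=512, condensing_ratio=8 and a 2048-token sequence:
    #   step 0: start_idx=0,    end_idx=1024, full window, beacon_size=64,
    #           next_start_idx=512,  raw_size_to_cache=512, feeds 1024 new tokens + 64 beacons
    #   step 1: start_idx=512,  end_idx=1536, full window, beacon_size=64,
    #           next_start_idx=1024, raw_size_to_cache=512, feeds 512 new tokens + 64 beacons
    #   step 2: start_idx=1024, end_idx=2048, analogous, after which finish becomes True
    # so each full window adds 64 beacon activations while at most 512 raw activations stay cached.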
    def update_memory(self, past_key_values):
        """
        Accumulate beacon activations and raw activations.
        """
        for layer_idx, (key, value, beacon_size, raw_size_to_cache, window_size) in enumerate(past_key_values):
            # NOTE: past_key_values are returned incrementally (only the new keys and values are returned)
            # key/value: (batch_size, num_head, new_seq_len, head_dim)
            # beacon_size: how many beacon activations are in key and value
            # raw_size_to_cache: how many raw activations should be kept

            previous_beacon_key, previous_beacon_value = self.beacon_activations[layer_idx]
            previous_raw_key, previous_raw_value = self.raw_activations[layer_idx]

            if beacon_size == 0:
                # the current input does not fill a window;
                # the key and value are all raw activations, which we accumulate until the window is filled
                beacon_key = previous_beacon_key
                beacon_value = previous_beacon_value
                assert raw_size_to_cache == window_size
                raw_key = cat_tensor([previous_raw_key, key], dim=self.k_seq_dim)
                raw_value = cat_tensor([previous_raw_value, value], dim=self.v_seq_dim)

            elif beacon_size == -1:
                # the raw activations are used as beacon activations for this window
                if raw_size_to_cache > 0:
                    # concatenate previous raw activations with the current ones, then keep the
                    # last raw_size_to_cache as raw memory and move the rest into beacon memory
                    concat_key = cat_tensor([previous_raw_key, key], dim=self.k_seq_dim)
                    concat_value = cat_tensor([previous_raw_value, value], dim=self.v_seq_dim)
                    beacon_key = cat_tensor([previous_beacon_key, slice_tensor(concat_key, end=-raw_size_to_cache, dim=self.k_seq_dim)], dim=self.k_seq_dim)
                    beacon_value = cat_tensor([previous_beacon_value, slice_tensor(concat_value, end=-raw_size_to_cache, dim=self.v_seq_dim)], dim=self.v_seq_dim)
                    raw_key = slice_tensor(concat_key, start=-raw_size_to_cache, dim=self.k_seq_dim)
                    raw_value = slice_tensor(concat_value, start=-raw_size_to_cache, dim=self.v_seq_dim)
                else:
                    # no raw activations to cache means stride == window, so everything goes into beacon memory
                    beacon_key = cat_tensor([previous_beacon_key, key], dim=self.k_seq_dim)
                    beacon_value = cat_tensor([previous_beacon_value, value], dim=self.v_seq_dim)
                    raw_key = None
                    raw_value = None

            else:
                # the [-beacon_size:] activations are from beacons and are accumulated into beacon memory;
                # the [-raw_size_to_cache - beacon_size: -beacon_size] raw activations are cached;
                # if they are fewer than raw_size_to_cache, part of the previous raw activations is also kept
                beacon_key = cat_tensor([previous_beacon_key, slice_tensor(key, start=-beacon_size, dim=self.k_seq_dim)], dim=self.k_seq_dim)
                beacon_value = cat_tensor([previous_beacon_value, slice_tensor(value, start=-beacon_size, dim=self.v_seq_dim)], dim=self.v_seq_dim)

                if key.shape[self.k_seq_dim] < raw_size_to_cache + beacon_size:
                    concat_raw_key = cat_tensor([previous_raw_key, slice_tensor(key, end=-beacon_size, dim=self.k_seq_dim)], dim=self.k_seq_dim)
                    concat_raw_value = cat_tensor([previous_raw_value, slice_tensor(value, end=-beacon_size, dim=self.v_seq_dim)], dim=self.v_seq_dim)
                    raw_key = slice_tensor(concat_raw_key, start=-raw_size_to_cache, dim=self.k_seq_dim)
                    raw_value = slice_tensor(concat_raw_value, start=-raw_size_to_cache, dim=self.v_seq_dim)
                else:
                    # becomes None when raw_size_to_cache == 0
                    raw_key = slice_tensor(key, start=-raw_size_to_cache - beacon_size, end=-beacon_size, dim=self.k_seq_dim)
                    raw_value = slice_tensor(value, start=-raw_size_to_cache - beacon_size, end=-beacon_size, dim=self.v_seq_dim)

            self.beacon_activations[layer_idx] = (beacon_key, beacon_value)
            self.raw_activations[layer_idx] = (raw_key, raw_value)

        # NOTE: this is for debugging the resilience to different beacons
        # if self.step_idx == 2:
        #     print(self.get_memory_size())
        #     torch.save(self.beacon_activations, "beacon_activations")
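    # Slicing walkthrough for the beacon_size > 0 case (illustrative, continuing the
    # step-0 example above: key holds 1024 new raw activations + 64 beacon activations,
    # raw_size_to_cache=512): since 1088 >= 512 + 64, the else branch applies, so
    #   beacon memory gains key[..., -64:, :]     (the beacon activations), and
    #   raw memory becomes  key[..., -576:-64, :] (the last 512 raw activations),
    # while the first 512 raw activations are dropped, having been condensed into beacons.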
    def update_loss(self, batch_loss, valid_token_num):
        """
        Accumulate loss over windows for the later perplexity computation and backward pass.
        """
        if self.batch_loss is None:
            # NOTE: multiply by valid_token_num because batch_loss was divided by it in advance
            self.batch_loss = batch_loss * valid_token_num
            self.valid_token_num = valid_token_num
        else:
            # NOTE: avoid in-place operations, otherwise there will be gradient errors during training
            self.batch_loss = self.batch_loss + batch_loss * valid_token_num
            self.valid_token_num = self.valid_token_num + valid_token_num

    def output(self, model_outputs):
        """
        Override the loss with the accumulated loss and strip beacon logits.
        """
        # override loss
        if self.batch_loss is not None:
            # here batch_loss is the summation of all token losses in each element
            loss = self.batch_loss.sum() / self.valid_token_num.sum()

            # NOTE: prevent nan
            batch_loss = self.batch_loss / self.valid_token_num
            if (self.valid_token_num == 0).any():
                batch_loss = batch_loss.masked_fill(self.valid_token_num == 0, 0.)

            # NOTE: we must use the dict interface to override values, otherwise the trainer cannot find the loss
            model_outputs["loss"] = loss
            model_outputs["batch_loss"] = batch_loss
            model_outputs["valid_token_num"] = self.valid_token_num

        # remove logits corresponding to beacon tokens (matters in generation)
        beacon_size = self._beacon_sizes[-1]
        if beacon_size > 0:
            model_outputs["logits"] = model_outputs["logits"][:, :-beacon_size]

        # print(f"process {dist.get_rank()}: loss {loss}")
        # print(f"process {dist.get_rank()}: beacon_sizes {self._beacon_sizes}")
        return model_outputs
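    # Numeric sketch of the loss accumulation (illustrative): if two windows produce
    # per-token losses 2.0 over 100 tokens and 1.0 over 50 tokens, update_loss() stores
    # 2.0 * 100 + 1.0 * 50 = 250 and 150 valid tokens, so output() reports
    # loss = 250 / 150 ≈ 1.67, the token-level average over the whole sequence.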
def slice_tensor(x, start=None, end=None, dim=2):
    if x is None:
        return None
    if end == 0:
        return None
    if start == x.shape[dim]:
        return None
    if start == end:
        return None
    if dim == 2:
        if start is None and end is not None:
            return x[:, :, :end, ...]
        elif start is not None and end is None:
            return x[:, :, start:, ...]
        elif start is not None and end is not None:
            return x[:, :, start:end, ...]
    elif dim == 1:
        if start is None and end is not None:
            return x[:, :end, ...]
        elif start is not None and end is None:
            return x[:, start:, ...]
        elif start is not None and end is not None:
            return x[:, start:end, ...]
    else:
        raise NotImplementedError


def cat_tensor(list_of_tensors, dim=-1):
    # concatenate along dim, silently dropping None entries
    list_of_tensors = [t for t in list_of_tensors if t is not None]
    if len(list_of_tensors) > 1:
        result = torch.cat(list_of_tensors, dim=dim)
    elif len(list_of_tensors) == 1:
        result = list_of_tensors[0]
    else:
        result = None
    return result


def softmax(x: np.ndarray, axis=-1, temperature=1):
    if isinstance(x, list):
        x = np.array(x)
    x = x / temperature
    # subtract the max for numerical stability
    x = x - x.max(axis=axis, keepdims=True)
    y = np.exp(x)
    return y / y.sum(axis=axis, keepdims=True)


def l1_norm(x):
    sum_x = sum(x)
    x = [y / sum_x for y in x]
    return x
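# A minimal smoke test of the prepare -> step -> update_memory loop (a sketch, not part
# of the original module): it fakes the model config and the per-layer key/value tensors
# a transformer forward would return, so the memory bookkeeping can run standalone.
# num_heads, head_dim and the random tensors below are illustrative assumptions. Because
# of the relative import at the top, run it as a module, e.g. `python -m <package>.<this_module>`.
if __name__ == "__main__":
    from types import SimpleNamespace

    config = SimpleNamespace(vocab_size=32000, num_hidden_layers=2, max_position_embeddings=4096)
    memory = Memory(config, beacon_window=8, beacon_stride=[4], beacon_ratio=[2])

    num_heads, head_dim = 4, 16
    input_ids = torch.randint(0, config.vocab_size, (1, 20))
    memory.prepare(input_ids, attention_mask=None, labels=None)

    while not memory.finish:
        window_ids, attention_mask, past_key_values, labels = memory.step()
        # fabricate the new key/value activations covering exactly the tokens fed this step
        fake_past = []
        for _, _, beacon_size, raw_size_to_cache, window_size in past_key_values:
            k = torch.randn(1, num_heads, window_ids.shape[1], head_dim)
            v = torch.randn(1, num_heads, window_ids.shape[1], head_dim)
            fake_past.append((k, v, beacon_size, raw_size_to_cache, window_size))
        memory.update_memory(fake_past)
        print(f"step {memory.step_idx}: (beacon, raw, total) memory = {memory.get_memory_size()}")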