"""Gradio demo for inspecting out-of-fold (OOF) predictions.

Loads one prediction file per experiment, joins the predictions with the
training data, computes a per-sample loss for every experiment and serves a
small UI for paging through the samples whose loss lies in a chosen range.
"""

import json
import os
from pathlib import Path

import gradio as gr
# import polars as pl
import matplotlib.pyplot as plt
import pandas as pd
import torch
from gradio import ChatMessage

# The app treats a set SPACE_AUTHOR_NAME environment variable as "running
# inside a Space" and switches data locations accordingly.
IN_SPACE = bool(os.environ.get("SPACE_AUTHOR_NAME", False))

files = [
    "./lmsys-ex38-model_oof_df.parquet",
    "./lmsys-ex41-model_oof_df.parquet",
    "./lmsys-ex43-model_oof_df.parquet",
    "./lmsys-exp-llm-049-weight_preds.parquet",
    "./lmsys-exp-llm-053-weight_preds.parquet",
    "./lmsys-exp-llm-063-weight_preds.parquet",
    "./lmsys-exp-llm-065-weight_preds.parquet",
    "./lmsys-exp-llm-073-weight_preds.parquet",
    "./lmsys-exp-llm-078-weight_preds.parquet",
    "./lmsys-exp-llm-081-weight_preds.parquet",
    "./lmsys-exp-llm-085-weight_preds.parquet",
    "./lmsys-oof-exp2_preds.parquet",
    "./lmsys-oof-exp29_preds.parquet",
]

train_filepath = "./train.parquet"

if not IN_SPACE:
    # Local runs read the data from the repository's data directory and pick
    # up the token from a .env file.
    files = [x.replace("./", "../../data/oofs/") for x in files]
    train_filepath = "../../data/train.parquet"

    from dotenv import load_dotenv

    loaded = load_dotenv("../../.env")
    print("Loaded .env file:", loaded)

HF_TOKEN = os.getenv("HF_READ_OOFS_TOKEN")

if not HF_TOKEN:
    print("be sure to set HF_READ_OOFS_TOKEN in .env file")

if not Path(files[0]).exists():
    # The prediction files are missing locally: fetch the whole dataset repo.
    from huggingface_hub import snapshot_download, login

    login(token=HF_TOKEN)
    snapshot_download(
        "nbroad/lmsys-cahpp-oofs",
        repo_type="dataset",
        local_dir="./",
        local_dir_use_symlinks=False,
    )

# One-hot ground-truth columns of the training data, in prediction order.
LABEL_COLS = ["winner_model_a", "winner_model_b", "winner_tie"]

# (file-name prefix, separator that ends the experiment id). Order matters:
# "lmsys-exp-llm-" must be tried before the shorter "lmsys-ex".
_EXPERIMENT_PATTERNS = (
    ("lmsys-exp-llm-", "-"),
    ("lmsys-ex", "-"),
    ("lmsys-oof-exp", "_"),
)


def _experiment_id(path):
    """Return the experiment id embedded in a prediction file path.

    Raises:
        ValueError: if the path matches none of the known naming schemes.
    """
    for prefix, separator in _EXPERIMENT_PATTERNS:
        if prefix in path:
            return path.split(prefix)[1].split(separator)[0]
    raise ValueError(f"Cannot derive an experiment id from file name: {path}")


def _pred_cols(exp):
    """Return the three per-experiment probability column names (A, B, tie)."""
    return [
        f"winner_model_a_prob_{exp}",
        f"winner_model_b_prob_{exp}",
        f"winner_tie_prob_{exp}",
    ]


# Maps both the full path and the bare file name (as shown in the dropdown)
# to the experiment id.
exps = {}

for f in files:
    exp = _experiment_id(f)
    exps[f] = exp
    exps[f.split("/")[-1]] = exp


def _rename_prediction_columns(df, exp):
    """Rename whichever prediction column scheme *df* uses to `_pred_cols(exp)`."""
    if "0" in df.columns:
        source = ["0", "1", "2"]
    elif "winner_tie_prob" not in df.columns:
        source = ["winner_model_a", "winner_model_b", "winner_tie"]
    else:
        source = ["winner_model_a_prob", "winner_model_b_prob", "winner_tie_prob"]
    return df.rename(columns=dict(zip(source, _pred_cols(exp))))


def _loss_column(file, all_check):
    """Return the loss column to use for the current UI selection."""
    if all_check or file is None or file == "":
        return "avg_loss"
    return f"loss_{exps[file]}"


def make_df():
    """Build the main table and the id -> texts lookup.

    Returns:
        A tuple ``(joined, id2texts)``. ``joined`` holds, per sample, the
        probabilities and loss of every experiment, their averages, the text
        lengths and the training columns except the texts themselves.
        ``id2texts`` maps a sample id to ``(prompt, response_a, response_b)``.
    """
    frames = []
    for path in files:
        exp = exps[path]
        frame = _rename_prediction_columns(pd.read_parquet(path), exp)
        # pd.concat(axis=1) aligns on index labels, so the index must be
        # reset after sorting or the sort would not affect the alignment.
        frame = frame.sort_values("id").reset_index(drop=True)
        frames.append(frame[["id"] + _pred_cols(exp)])

    id_col = frames[0]["id"]
    joined = pd.concat([x.drop("id", axis=1) for x in frames], axis=1)
    joined["id"] = id_col

    tdf = pd.read_parquet(train_filepath)
    joined = joined.merge(tdf, on="id", how="left")

    joined["winner"] = ""
    for label_col, winner_name in zip(LABEL_COLS, ("A", "B", "Tie")):
        joined.loc[joined[label_col] == 1, "winner"] = winner_name

    # Index of the ground-truth class per row; it does not depend on the
    # experiment, so it is computed once.
    gt_idxs = torch.as_tensor(joined[LABEL_COLS].values.argsort()[:, -1])

    # `exps` holds two keys per file, so de-duplicate the experiment ids.
    for exp in dict.fromkeys(exps.values()):
        pred_cols = _pred_cols(exp)
        raw_scores = joined[pred_cols].values

        scores = torch.tensor(raw_scores)
        # Heuristic: if any row sums to clearly more than 1 the file is
        # treated as unnormalised scores and passed through softmax.
        if raw_scores.sum(axis=-1).max() > 1.1:
            scores = scores.softmax(-1)

        joined[pred_cols] = scores.numpy()

        true_class_prob = scores[torch.arange(scores.shape[0]), gt_idxs]

        # With an all-ones target this is -log(p) of the true class. The
        # target takes the dtype of the predictions to avoid a mismatch.
        joined[f"loss_{exp}"] = torch.nn.functional.binary_cross_entropy(
            true_class_prob, torch.ones_like(true_class_prob), reduction="none"
        ).numpy()

    for text_col in ("prompt", "response_a", "response_b"):
        joined[f"{text_col}_length"] = joined[text_col].map(len)

    joined["total_length"] = (
        joined["prompt_length"]
        + joined["response_a_length"]
        + joined["response_b_length"]
    )

    loss_cols = [x for x in joined.columns if "loss" in x]
    joined["avg_loss"] = joined[loss_cols].mean(axis=1)

    for target in LABEL_COLS:
        target_prob_cols = [x for x in joined.columns if f"{target}_prob" in x]
        joined[f"avg_{target}"] = joined[target_prob_cols].mean(axis=1)

    # Per-experiment probabilities and all loss columns (including avg_loss)
    # are stored as float16 -- presumably to keep the table small.
    prob_cols = [x for x in joined.columns if "prob" in x]
    loss_cols = [x for x in joined.columns if "loss" in x]
    joined[prob_cols + loss_cols] = joined[prob_cols + loss_cols].astype("float16")

    text_cols = ["prompt", "response_a", "response_b"]
    id2texts = {i: (p, a, b) for i, p, a, b in joined[["id"] + text_cols].values}

    # The texts are served from id2texts, so they are dropped from the table.
    joined = joined.drop(columns=text_cols)

    return joined, id2texts


MAIN_DF, id2texts = make_df()


def filter_df(lower_limit, upper_limit, file, all_check):
    """Select the samples whose loss lies strictly between the two limits.

    Returns:
        ``(0, frame)``: the index to show first and the matching rows sorted
        by descending loss.
    """
    loss_col = _loss_column(file, all_check)

    in_range = (MAIN_DF[loss_col] > lower_limit) & (MAIN_DF[loss_col] < upper_limit)
    temp = MAIN_DF[in_range]
    temp = temp.sort_values(loss_col, ascending=False).reset_index(drop=True)

    return 0, temp


def make_chat(prompt, response, side, label):
    """Build the chat messages for one side of a comparison.

    Args:
        prompt: JSON-encoded list of user turns.
        response: JSON-encoded list of assistant turns (entries may be null).
        side: The side being rendered ("A" or "B").
        label: The ground-truth winner ("A", "B" or "Tie").

    Returns:
        A list of alternating user/assistant ``ChatMessage`` objects, each
        prefixed with a winner/tie/loser header.
    """
    prompts = json.loads(prompt)
    responses = json.loads(response)

    if side == label:
        header = "✅ Winner ✅"
    elif label == 2 or label == "Tie":
        header = "🟨 Tie 🟨"
    else:
        header = "❌ Loser ❌"

    chat = []
    for p, r in zip(prompts, responses):
        # Null turns are rendered as empty text instead of failing on
        # string concatenation.
        p = "" if p is None else p
        r = "" if r is None else r
        chat.append(ChatMessage(role="user", content=header + "\n" + p))
        chat.append(ChatMessage(role="assistant", content=header + "\n" + r))

    return chat


def show_chats(idx, df, file, all_check):
    """Render the sample at position *idx* of the filtered table.

    Returns:
        A 7-tuple ``(chat_a, chat_b, label, score_a, score_b, score_tie,
        loss)`` matching the outputs wired to this handler. All entries are
        ``None`` when there is nothing to show.
    """
    # Always return one value per wired output, also when nothing can be
    # shown (no index, no filtered table yet, or no rows in range).
    if idx is None or df is None or df.shape[0] == 0 or "id" not in df.columns:
        return (None,) * 7

    # The number box may deliver a float; .iloc needs an int. Clamp the
    # position into the valid range.
    idx = min(max(int(idx), 0), df.shape[0] - 1)

    row = df.iloc[idx]

    label = row["winner"]
    id_ = row["id"]

    p, a, b = id2texts[id_]

    chat_a = make_chat(p, a, "A", label)
    chat_b = make_chat(p, b, "B", label)

    if all_check or file is None or file == "":
        score_cols = ["avg_winner_model_a", "avg_winner_model_b", "avg_winner_tie"]
    else:
        score_cols = _pred_cols(exps[file])

    scores = row[score_cols].to_list()
    loss = row[_loss_column(file, all_check)]

    return chat_a, chat_b, label, *scores, loss


def show_split(text):
    """Render one textbox with a "Clear" button per character of *text*.

    Not wired to any event in this file.
    """
    if len(text) == 0:
        gr.Markdown("## No Input Provided")
    else:
        for letter in text:
            with gr.Row():
                textbox = gr.Textbox(letter)
                btn = gr.Button("Clear")
                btn.click(lambda: gr.Textbox(value=""), None, textbox)


def update_plot(df, file, all_check):
    """Draw a 50-bin histogram of the selected loss column.

    Not wired to any event in this file (the wiring below is commented out).
    """
    # NOTE(review): plt.hist returns (counts, bin_edges, patches), not a
    # figure; confirm what the plot component expects before re-enabling.
    print(df.columns)
    print("avg_loss" in df.columns)

    loss_col = _loss_column(file, all_check)

    return plt.hist(df[loss_col], bins=50)


def _previous_index(x):
    """Step one sample back, never going below zero."""
    return max(0, int(x or 0) - 1)


def _next_index(x):
    """Step one sample forward (show_chats clamps to the table length)."""
    return int(x or 0) + 1


# NOTE(review): the link target after "[this notebook](" is missing in the
# source; restore the notebook URL and the closing parenthesis.
INTRO_MARKDOWN = """
# OOF Visualization
This is a demo for visualizing the out-of-fold predictions of a model.
It currently shows the predictions for the outputs of [this notebook](
"""

with gr.Blocks() as demo:
    gr.Markdown(INTRO_MARKDOWN)

    with gr.Row():
        with gr.Column():
            file = gr.Dropdown(label="File", choices=[x.split("/")[-1] for x in files])
        with gr.Column():
            all_check = gr.Checkbox(label="Use average loss of all files", value=True)

    with gr.Row():
        lower_limit = gr.Slider(
            label="Show samples with loss > this value", minimum=0, maximum=5, value=1
        )
        upper_limit = gr.Slider(
            label="Show samples with loss < this value", minimum=0, maximum=5, value=5
        )

    # id_ = gr.Number(label="ID")
    idx = gr.Number(visible=True)
    # Holds the filtered table between events without displaying it.
    hidden_df = gr.Dataframe(visible=False)

    with gr.Row():
        correct_label = gr.Textbox(label="Correct Label", interactive=False)
        score_a = gr.Textbox(label="Model A Score", interactive=False)
        score_b = gr.Textbox(label="Model B Score", interactive=False)
        score_tie = gr.Textbox(label="Tie Score", interactive=False)
        loss = gr.Textbox(label="Loss", interactive=False)

    with gr.Row():
        with gr.Column():
            prev_btn = gr.Button(value="Previous")
        with gr.Column():
            next_btn = gr.Button(value="Next")

    with gr.Row():
        with gr.Column():
            chat_a = gr.Chatbot(label="Model A", type="messages", height=1000)
        with gr.Column():
            chat_b = gr.Chatbot(label="Model B", type="messages", height=1000)

    # with gr.Row():
    #     plot = gr.Plot()

    # Moving either slider re-filters the table and jumps back to index 0.
    for limit_slider in (lower_limit, upper_limit):
        limit_slider.change(
            filter_df,
            inputs=[lower_limit, upper_limit, file, all_check],
            outputs=[idx, hidden_df],
        )

    # hidden_df.change(update_plot, [hidden_df, file, all_check], plot)

    idx.change(
        show_chats,
        inputs=[idx, hidden_df, file, all_check],
        outputs=[chat_a, chat_b, correct_label, score_a, score_b, score_tie, loss],
    )

    prev_btn.click(_previous_index, inputs=idx, outputs=idx)
    next_btn.click(_next_index, inputs=idx, outputs=idx)


if __name__ == "__main__":
    demo.launch(debug=True)