how_many_data_points_zh

Sleeping

App Files Files Community

MatrixYao

julien-c HF staff committed on Jul 5, 2023

Commit

9292fbb

•

0 Parent(s):

Duplicate from teven-projects/how_many_data_points

Browse files

Co-authored-by: Julien Chaumond <[email protected]>

Files changed (9) hide show

.gitattributes +34 -0
.gitignore +6 -0
Dockerfile +11 -0
README.md +12 -0
naacl_demo/demo_utils.py +514 -0
naacl_demo/main.py +294 -0
naacl_demo/text.md +82 -0
naacl_demo/text.py +169 -0
requirements.txt +22 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,34 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,6 @@

+__pycache__/
+*.py[cod]
+*$py.class
+.env/

Dockerfile ADDED Viewed

	@@ -0,0 +1,11 @@

+FROM python:3.7
+WORKDIR /code
+COPY ./requirements.txt /code/requirements.txt
+RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+COPY . .
+CMD ["bokeh", "serve", "naacl_demo", "--allow-websocket-origin=*"]

README.md ADDED Viewed

	@@ -0,0 +1,12 @@

+---
+title: How Many Data Points
+emoji: 🦀
+colorFrom: red
+colorTo: yellow
+sdk: docker
+pinned: false
+app_port: 5006
+duplicated_from: teven-projects/how_many_data_points
+---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

naacl_demo/demo_utils.py ADDED Viewed

	@@ -0,0 +1,514 @@

+import math
+import pandas as pd
+import numpy as np
+from itertools import product
+import shapely
+from bokeh.models import Span, Label, ColumnDataSource, Whisker
+from bokeh.plotting import figure, show
+from shapely.geometry import Polygon
+import matplotlib as mpl
+import matplotlib.pyplot as plt
+import seaborn
+# Per-task configuration tables, keyed by task name.
+# Prompt-pattern indices available for each task; get_data loads one "PVP <index>" column group per entry.
+task_patterns = {
+    "CB": [0, 3],
+    "RTE": [0, 3],
+    "BoolQ": [0, 3, 5],
+    "MNLI": [0, 3],
+    "COPA": [0, 1],
+    "WSC": [0, 1, 2],
+    "WiC": [0, 1],
+    "MultiRC": [0, 1, 2],
+}
+# Number of seeds per set-up; columns are read for seeds 0 .. task_reps[task] - 1.
+task_reps = {"CB": 4, "RTE": 4, "BoolQ": 4, "MNLI": 4, "COPA": 4, "WSC": 4, "WiC": 4, "MultiRC": 4}
+# Pattern index that main.py passes to reduct() as `final_pattern` for each task.
+task_best_pattern = {"CB": 0, "RTE": 0, "BoolQ": 0, "MNLI": 0, "COPA": 1, "WSC": 0, "WiC": 0, "MultiRC": 1}
+# Metric abbreviation embedded in the CSV column names built by naming_convention().
+task_metric_short = {
+    "CB": "f1-macro",
+    "RTE": "acc",
+    "BoolQ": "acc",
+    "MNLI": "acc",
+    "COPA": "acc",
+    "WSC": "acc",
+    "WiC": "acc",
+    "MultiRC": "f1",
+}
+# Human-readable metric name, used as the y-axis label of the plots.
+task_metrics = {
+    "CB": "F1-macro",
+    "RTE": "accuracy",
+    "BoolQ": "accuracy",
+    "MNLI": "accuracy",
+    "COPA": "accuracy",
+    "WSC": "accuracy",
+    "WiC": "accuracy",
+    "MultiRC": "F1",
+}
+# Whether get_data() should also load the "neutral verbalizer" columns for the task.
+# NOTE(review): the lowercase "multirc" key duplicates "MultiRC" and is not looked up by any
+# code visible here (lookups use the canonical task names) — confirm before removing.
+task_neutral = {
+    "CB": True,
+    "RTE": True,
+    "BoolQ": True,
+    "MNLI": True,
+    "COPA": False,
+    "WSC": False,
+    "multirc": True,
+    "WiC": True,
+    "MultiRC": True,
+}
+# Tasks iterated by main.py when building the null-verbalizer diagram.
+neutral_tasks = [
+    "BoolQ",
+    "CB",
+    "MNLI",
+    "MultiRC",
+    "RTE",
+    "WiC",
+]
+# Alphabetically sorted task names; main.py creates one advantage tab per entry.
+tasks = sorted(task_patterns.keys())
+# Colour ramps per method (prompting, null-verbalizer control, classifier head).
+# The plotting code uses index 0 for curves/markers and index 1 for filled areas.
+pvp_colors = ["goldenrod", "blanchedalmond", "floralwhite"]
+ctl_colors = ["crimson", "salmon", "mistyrose"]
+clf_colors = ["indigo", "plum", "thistle"]
+def prompt_boolq(passage, question, pattern):
+    if pattern == 0:
+        return f"""<span style="color: #0c593d">{passage}</span> <span style="color: #910713"><b>Based on the previous passage,</b></span> <span style="color: #031154">{question}</span> <span style="color: #ba9004"><b>[YES/NO]</b></span>"""
+    if pattern == 1:
+        return f"""<span style="color: #0c593d">{passage}</span><span style="color: #910713"><b> Question:</b></span> <span style="color: #031154">{question}</span><span style="color: #910713"><b> Answer: </b></span><span style="color: #ba9004"><b>[YES/NO]</b></span>"""
+    if pattern == 2:
+        return f"""<span style="color: #910713"><b>Based on the following passage,</b></span> <span style="color: #031154">{question}</span><span style="color: #ba9004"><b> [YES/NO]</b></span> <span style="color: #0c593d">{passage}</span>"""
+def advantage_text(advantage):
+    model_type = (
+        """<span style="color: #4B0082">Head</span>"""
+        if advantage < 0
+        else """<span style="color: #daa520">Prompting</span>"""
+    )
+    return f"""<b>{model_type}</b> advantage: <b>{abs(advantage):.2f}</b> data points"""
+def average_advantage_text(advantage):
+    model_type = (
+        """<span style="color: #4B0082">head</span>"""
+        if advantage < 0
+        else """<span style="color: #daa520">prompting</span>"""
+    )
+    return f"""<b>Average {model_type}</b> advantage: <b>{abs(advantage):.2f}</b> data points"""
+def naming_convention(task, seed, pvp_index=None, neutral=False):
+    method = f"PVP {pvp_index}" if pvp_index is not None else "CLF"
+    model = "roberta"
+    if neutral:
+        verbalizer = "neutral"
+    else:
+        verbalizer = None
+    return (
+            f"{method} {model}"
+            + (f" {verbalizer} verbalizer" if verbalizer is not None else "")
+            + f" seed {seed} - test-{task_metric_short[task]}-all-p"
+    )
+def get_data(task):
+    url = f"https://raw.githubusercontent.com/TevenLeScao/pet/master/exported_results/{task.lower()}/wandb_export.csv"
+    df = pd.read_csv(url)
+    training_points = df["training_points"]
+    head_performances = np.transpose(np.array([df[naming_convention(task, i)] for i in range(task_reps[task])]))
+    pattern_performances = {}
+    for pattern in task_patterns[task]:
+        pattern_performances[pattern] = {
+            "normal": np.transpose(np.array([df[naming_convention(task, i, pattern)] for i in range(task_reps[task])]))
+        }
+        if task_neutral[task]:
+            pattern_performances[pattern]["neutral"] = np.transpose(
+                np.array([df[naming_convention(task, i, pattern, True)] for i in range(task_reps[task])])
+            )
+    return training_points, head_performances, pattern_performances
+def reduct(performances, reduction="accmax", final_pattern=0, verbalizer="normal", exclude=None):
+    # Combining the different runs for each experimental set-up
+    reducted = None
+    if isinstance(performances, dict):
+        performances = performances[final_pattern][verbalizer]
+    if exclude is not None:
+        performances = np.delete(performances, exclude, axis=1)
+    if reduction == "avg":
+        # Average
+        reducted = np.nanmean(performances, axis=1)
+    if reduction == "std":
+        # Standard deviation
+        reducted = np.nanstd(performances, axis=1)
+    if reduction == "max":
+        # Maximum
+        reducted = np.nanmax(performances, axis=1)
+    if reduction == "accmax":
+        # This makes the maximum curve monotonic
+        max_performance = np.nanmax(performances, axis=1)
+        reducted = np.maximum.accumulate(max_performance)
+    assert reducted is not None, "unrecognized reduction method"
+    return reducted
+def find_surrounding_points(perf, clf_results, pvp_results):
+    for i, clf_result in enumerate(clf_results):
+        if i - 1 > 0 and clf_result == clf_results[i - 1]:
+            continue
+        if clf_result > perf:
+            if i == 0:
+                raise ValueError(f"value {perf} too small")
+            else:
+                break
+    for j, pvp_result in enumerate(pvp_results):
+        if j - 1 > 0 and pvp_result == pvp_results[j - 1]:
+            continue
+        if pvp_result > perf:
+            if j == 0:
+                raise ValueError(f"value {perf} too small")
+            else:
+                break
+    return i - 1, j - 1
+def interpolate(perf, x1, x2, y1, y2):
+    return x1 + (perf - y1) * (x2 - x1) / (y2 - y1)
+def interpolate_from_idx(perf, idx, results, training_points):
+    return interpolate(perf, training_points[idx], training_points[idx + 1], results[idx], results[idx + 1])
+def interpolate_from_perf(perf, overlapping_range, training_points, clf_results, pvp_results):
+    if not overlapping_range[0] <= perf <= overlapping_range[1]:
+        raise ValueError(f"perf {perf} not in acceptable bounds {overlapping_range}")
+    clf_idx, pvp_idx = find_surrounding_points(perf, clf_results, pvp_results)
+    return interpolate_from_idx(perf, clf_idx, clf_results, training_points), interpolate_from_idx(
+        perf, pvp_idx, pvp_results, training_points
+    )
+def data_difference(perf, overlapping_range, training_points, clf_results, pvp_results):
+    x1, x2 = interpolate_from_perf(perf, overlapping_range, training_points, clf_results, pvp_results)
+    return x1 - x2
+def calculate_overlap(clf_results, pvp_results, full_range=False):
+    if full_range:
+        return (min(min(clf_results), min(pvp_results)), max(max(clf_results), max(pvp_results)))
+    else:
+        return (max(min(clf_results), min(pvp_results)), min(max(clf_results), max(pvp_results)))
+def calculate_range(overlapping_range, number_of_points):
+    integral_range = (
+        overlapping_range[0] + i / (number_of_points + 1) * (overlapping_range[1] - overlapping_range[0])
+        for i in range(1, number_of_points + 1)
+    )
+    return integral_range
+def calculate_differences(integral_range, overlapping_range, training_points, clf_results, pvp_results):
+    differences = [
+        data_difference(y, overlapping_range, training_points, clf_results, pvp_results) for y in integral_range
+    ]
+    return differences
+def calculate_offset(training_points, clf_results, pvp_results, number_of_points=1000):
+    overlapping_range = calculate_overlap(clf_results, pvp_results)
+    integral_range = calculate_range(overlapping_range, number_of_points)
+    differences = calculate_differences(integral_range, overlapping_range, training_points, clf_results, pvp_results)
+    offset = sum(differences) / number_of_points
+    return offset
+def intersection_with_range(training_points, results, band):
+    result_polygon = Polygon(
+        [(training_points[i], results[i]) for i in range(len(training_points))]
+        + [(training_points[-1], 0), (training_points[0], 0)]
+    )
+    return result_polygon.intersection(band)
+def fill_polygon(fig, polygon, color, label=None, alpha=1.0):
+    if polygon.is_empty or isinstance(polygon, shapely.geometry.LineString):
+        return
+    if isinstance(polygon, Polygon):
+        xs, ys = polygon.exterior.xy
+        fig.patch(xs, ys, color=color, alpha=alpha)
+    else:
+        for geom in polygon.geoms:
+            if isinstance(geom, shapely.geometry.LineString):
+                continue
+            xs, ys = geom.exterior.xy
+            fig.patch(xs, ys, color=color, alpha=alpha)
+            label = None
+# Ordering index for legend labels.
+# NOTE(review): nothing in the code visible here reads this table, and the plots use legend
+# strings such as "comparison region" / "null verbalizer run" that are not keys here — confirm
+# whether it is still needed.
+label_order = {
+    "head run": 0,
+    "head advantage": 1,
+    "control run": 2,
+    "optimization advantage": 3,
+    "prompting run": 4,
+    "semantics advantage": 5,
+    "region of comparison": 6,
+}
+def metric_tap(
+        event, overlapping_range, training_points, clf_results, pvp_results, advantage_box, advantage_plot
+):
+    _, metric_value = event.x, event.y
+    try:
+        advantage_value = data_difference(metric_value, overlapping_range, training_points, clf_results, pvp_results)
+        advantage_box.text = advantage_text(advantage_value)
+        if not isinstance(advantage_plot.renderers[-1], Span):
+            metric_line = Span(
+                location=metric_value,
+                line_alpha=0.7,
+                dimension="width",
+                line_color=clf_colors[0] if advantage_value < 0 else pvp_colors[0],
+                line_dash="dashed",
+                line_width=1,
+            )
+            advantage_plot.renderers.extend([metric_line])
+        else:
+            advantage_plot.renderers[-1].location = metric_value
+            advantage_plot.renderers[-1].line_color = clf_colors[0] if advantage_value < 0 else pvp_colors[0]
+    # clicking outside the region
+    except ValueError:
+        pass
+def plot_polygons_bokeh(task, training_points, clf_results, pvp_results, clf_colors, pvp_colors, x_log_scale=False):
+    """Build the head-vs-prompting figure for ``task`` and return it.
+
+    Draws both result curves over ``training_points``, hatches the performance band that the two
+    curves share (the comparison region), and shades the area lying under exactly one of the
+    curves, more strongly inside the band than outside it. ``clf_colors`` / ``pvp_colors`` are
+    colour lists: index 0 is used for curves and markers, index 1 for filled areas.
+    ``x_log_scale`` switches the x-axis to a log scale.
+    """
+    # Band covered by both curves, and band covered by either curve.
+    overlapping_range = calculate_overlap(clf_results, pvp_results, False)
+    full_range = calculate_overlap(clf_results, pvp_results, True)
+    middle_y = (full_range[0] + full_range[1]) / 2
+    fig = figure(plot_height=400, plot_width=800, max_height=400, max_width=800,
+                 x_axis_type="log" if x_log_scale else "linear", title="Performance over training subset sizes of head and prompting methods")
+    fig.circle(training_points, clf_results, color=clf_colors[0], legend="head run")
+    fig.circle(training_points, pvp_results, color=pvp_colors[0], legend="prompting run")
+    fig.line(training_points, clf_results, color=clf_colors[0], alpha=1)
+    fig.line(training_points, pvp_results, color=pvp_colors[0], alpha=1)
+    fig.xaxis.axis_label = "training subset size"
+    fig.yaxis.axis_label = task_metrics[task]
+    # Transparent, hatched rectangle marking the comparison region (also provides its legend entry).
+    fig.patch(
+        [training_points[0], training_points[0], training_points[-1], training_points[-1]],
+        [overlapping_range[0], overlapping_range[1], overlapping_range[1], overlapping_range[0]],
+        color="black",
+        fill_alpha=0,
+        line_width=0,
+        legend="comparison region",
+        hatch_alpha=0.14,
+        hatch_scale=40,
+        hatch_pattern="/",
+    )
+    # The same two bands as shapely rectangles, used to clip the areas under the curves.
+    band = Polygon(
+        [
+            (training_points[0], overlapping_range[0]),
+            (training_points[0], overlapping_range[1]),
+            (training_points[-1], overlapping_range[1]),
+            (training_points[-1], overlapping_range[0]),
+        ]
+    )
+    full_band = Polygon(
+        [
+            (training_points[0], full_range[0]),
+            (training_points[0], full_range[1]),
+            (training_points[-1], full_range[1]),
+            (training_points[-1], full_range[0]),
+        ]
+    )
+    clf_polygon = intersection_with_range(training_points, clf_results, band)
+    pvp_polygon = intersection_with_range(training_points, pvp_results, band)
+    full_clf_polygon = intersection_with_range(training_points, clf_results, full_band)
+    full_pvp_polygon = intersection_with_range(training_points, pvp_results, full_band)
+    # "inside": under one curve but not the other, within the comparison band.
+    clf_inside_area = clf_polygon.difference(pvp_polygon)
+    pvp_inside_area = pvp_polygon.difference(clf_polygon)
+    # "outside": the same difference over the full band, minus the part already counted inside.
+    clf_outside_area = (full_clf_polygon.difference(full_pvp_polygon)).difference(clf_inside_area)
+    pvp_outside_area = (full_pvp_polygon.difference(full_clf_polygon)).difference(pvp_inside_area)
+    fill_polygon(fig, clf_outside_area, clf_colors[1], alpha=0.13)
+    fill_polygon(fig, pvp_outside_area, pvp_colors[1], alpha=0.18)
+    # NOTE(review): fill_polygon does not forward `label` to the patches, so the labels passed
+    # below currently produce no legend entries.
+    fill_polygon(
+        fig, clf_inside_area, clf_colors[1], alpha=0.4, label="head advantage" if task == "WiC" else None
+    )
+    fill_polygon(fig, pvp_inside_area, pvp_colors[1], alpha=0.4, label="prompting advantage")
+    # Horizontal borders of the comparison region.
+    fig.line([training_points[0], training_points[-1]], [overlapping_range[0], overlapping_range[0]], color="dimgrey")
+    fig.line([training_points[0], training_points[-1]], [overlapping_range[1], overlapping_range[1]], color="dimgrey")
+    # Dashed vertical line and rotated label at the last training point.
+    vline = Span(
+        location=training_points[-1], dimension="height", line_color="black", line_width=2.5, line_dash="dashed"
+    )
+    end_label = Label(
+        x=training_points[-1], y=middle_y, text="End of dataset", angle=90, angle_units="deg", text_align="center"
+    )
+    # Order matters: the Label ends up as the last renderer, and metric_tap decides whether to
+    # create or reuse its marker by checking whether the last renderer is a Span.
+    fig.renderers.extend([vline, end_label])
+    fig.legend.location = "bottom_right"
+    return fig
+def plot_three_polygons_bokeh(
+        task, training_points, clf_results, pvp_results, ctl_results, clf_colors, pvp_colors, ctl_colors,
+        x_log_scale=False
+):
+    """Build the head / prompting / null-verbalizer figure for ``task`` and return it.
+
+    Same layout as ``plot_polygons_bokeh`` with a third curve (``ctl_results``, the null
+    verbalizer run). The comparison band is still computed from the head and prompting curves
+    only. Each ``*_colors`` argument is a colour list: index 0 for curves and markers, index 1
+    for filled areas.
+    """
+    # Band covered by both head and prompting curves, and band covered by either of them.
+    overlapping_range = calculate_overlap(clf_results, pvp_results, False)
+    full_range = calculate_overlap(clf_results, pvp_results, True)
+    middle_y = (full_range[0] + full_range[1]) / 2
+    fig = figure(plot_height=400, plot_width=800, max_height=400, max_width=800,
+                 x_axis_type="log" if x_log_scale else "linear", title="Performance over training subset sizes of head, prompting and prompting with a null verbalizer")
+    fig.xaxis.axis_label = "training subset size"
+    fig.yaxis.axis_label = task_metrics[task]
+    fig.circle(training_points, clf_results, color=clf_colors[0], legend="head run")
+    fig.circle(training_points, pvp_results, color=pvp_colors[0], legend="prompting run")
+    fig.circle(training_points, ctl_results, color=ctl_colors[0], legend="null verbalizer run")
+    fig.line(training_points, clf_results, color=clf_colors[0], alpha=1)
+    fig.line(training_points, pvp_results, color=pvp_colors[0], alpha=1)
+    fig.line(training_points, ctl_results, color=ctl_colors[0], alpha=1)
+    # Transparent, hatched rectangle marking the comparison region (also provides its legend entry).
+    fig.patch(
+        [training_points[0], training_points[0], training_points[-1], training_points[-1]],
+        [overlapping_range[0], overlapping_range[1], overlapping_range[1], overlapping_range[0]],
+        color="black",
+        fill_alpha=0,
+        line_width=0,
+        legend="comparison region",
+        hatch_alpha=0.14,
+        hatch_scale=40,
+        hatch_pattern="/",
+    )
+    # The same two bands as shapely rectangles, used to clip the areas under the curves.
+    band = Polygon(
+        [
+            (training_points[0], overlapping_range[0]),
+            (training_points[0], overlapping_range[1]),
+            (training_points[-1], overlapping_range[1]),
+            (training_points[-1], overlapping_range[0]),
+        ]
+    )
+    full_band = Polygon(
+        [
+            (training_points[0], full_range[0]),
+            (training_points[0], full_range[1]),
+            (training_points[-1], full_range[1]),
+            (training_points[-1], full_range[0]),
+        ]
+    )
+    clf_polygon = intersection_with_range(training_points, clf_results, band)
+    pvp_polygon = intersection_with_range(training_points, pvp_results, band)
+    ctl_polygon = intersection_with_range(training_points, ctl_results, band)
+    full_clf_polygon = intersection_with_range(training_points, clf_results, full_band)
+    full_pvp_polygon = intersection_with_range(training_points, pvp_results, full_band)
+    full_ctl_polygon = intersection_with_range(training_points, ctl_results, full_band)
+    # Inside the band: head over control, prompting over both others, control over head.
+    clf_inside_area = clf_polygon.difference(ctl_polygon)
+    pvp_inside_area = pvp_polygon.difference(clf_polygon).difference(ctl_polygon)
+    ctl_inside_area = ctl_polygon.difference(clf_polygon)
+    clf_outside_area = (full_clf_polygon.difference(full_ctl_polygon)).difference(clf_inside_area)
+    # NOTE(review): this subtracts the band-clipped ctl_polygon rather than full_ctl_polygon,
+    # unlike the other "outside" areas — confirm this is intended.
+    pvp_outside_area = (full_pvp_polygon.difference(full_clf_polygon).difference(ctl_polygon)).difference(
+        pvp_inside_area
+    )
+    # NOTE(review): subtracts pvp_inside_area where the parallel lines subtract their own
+    # "inside" area (ctl_inside_area would match the pattern) — possible copy-paste slip, confirm.
+    ctl_outside_area = (full_ctl_polygon.difference(full_clf_polygon)).difference(pvp_inside_area)
+    # NOTE(review): fill_polygon does not forward `label` to the patches, so the labels passed
+    # below currently produce no legend entries.
+    fill_polygon(
+        fig, clf_inside_area, clf_colors[1], alpha=0.4, label="head advantage" if task == "WiC" else None
+    )
+    fill_polygon(fig, pvp_inside_area, pvp_colors[1], alpha=0.4, label="prompting advantage")
+    fill_polygon(fig, ctl_inside_area, ctl_colors[1], alpha=0.4, label="null verbalizer advantage")
+    fill_polygon(fig, clf_outside_area, clf_colors[1], alpha=0.13)
+    fill_polygon(fig, pvp_outside_area, pvp_colors[1], alpha=0.18)
+    fill_polygon(fig, ctl_outside_area, ctl_colors[1], alpha=0.13)
+    # Horizontal borders of the comparison region.
+    fig.line([training_points[0], training_points[-1]], [overlapping_range[0], overlapping_range[0]], color="dimgrey")
+    fig.line([training_points[0], training_points[-1]], [overlapping_range[1], overlapping_range[1]], color="dimgrey")
+    # Dashed vertical line and rotated label at the last training point.
+    vline = Span(
+        location=training_points[-1], dimension="height", line_color="black", line_width=2.5, line_dash="dashed"
+    )
+    end_label = Label(
+        x=training_points[-1], y=middle_y, text="End of dataset", angle=90, angle_units="deg", text_align="center"
+    )
+    fig.renderers.extend([vline, end_label])
+    fig.legend.location = "bottom_right"
+    return fig
+def pattern_graph(task):
+    """Build a log-x figure comparing the prompt patterns of ``task`` and return it.
+
+    Downloads the task's results CSV, fills missing runs with the mean of the other seeds of the
+    same set-up, adds a small random jitter, and plots for every pattern the median over seeds
+    with a shaded min-max band.
+    """
+    fig = figure(plot_height=400, plot_width=800, max_height=400, max_width=800, x_axis_type="log", title="Performance over training subset sizes of different prompt patterns")
+    fig.xaxis.axis_label = "training subset size"
+    fig.yaxis.axis_label = task_metrics[task]
+    url = f"https://raw.githubusercontent.com/TevenLeScao/pet/master/exported_results/{task.lower()}/wandb_export.csv"
+    df = pd.read_csv(url)
+    # The training-point column tiled once per (pattern, seed) column of the flattened data below.
+    expanded_training_points = np.array(list(df["training_points"]) * task_reps[task] * len(task_patterns[task]))
+    # Columns are ordered pattern-major, seed-minor.
+    data = np.array(df[[naming_convention(task, seed, pattern) for pattern in task_patterns[task] for seed in
+                        range(task_reps[task])]])
+    # One row per (training point, pattern) group, one column per seed.
+    data = data.reshape(-1, task_reps[task])
+    # Despite the name, this is the NaN-ignoring mean of each row.
+    col_med = np.nanmean(data, axis=1)
+    # Find indices that you need to replace
+    inds = np.where(np.isnan(data))
+    # Place column means in the indices. Align the arrays using take
+    data[inds] = np.take(col_med, inds[0])
+    # Back to (training points, runs), then flatten run by run.
+    data = data.reshape(len(df["training_points"]), -1)
+    data = data.transpose().reshape(-1)
+    # Random jitter (normal, standard deviation 0.01); the output differs between calls.
+    data = data + np.random.normal(0, 0.01, len(data))
+    # Positional pattern index (0, 1, ...) of each flattened value.
+    pattern = np.array([i // (len(data) // len(task_patterns[task])) for i in range(len(data))])
+    # NOTE(review): hard-codes four seeds (matches task_reps today) and cycles per element, while
+    # the flattened data changes seed once per run of training points; the "seed" column is not
+    # used below, so this has no visible effect — confirm before relying on it.
+    seed = np.array([0, 1, 2, 3] * (len(data) // task_reps[task]))
+    long_df = pd.DataFrame(np.stack((expanded_training_points, pattern, seed, data), axis=1),
+                           columns=["training_points", "pattern", "seed", task_metrics[task]])
+    long_df['pattern'] = long_df['pattern'].astype(int).astype(str)
+    gby_pattern = long_df.groupby('pattern')
+    pattern_colors = ["royalblue", "darkturquoise", "darkviolet"]
+    for i, (pattern, pattern_df) in enumerate(gby_pattern):
+        gby_training_points = pattern_df.groupby('training_points')
+        x = [training_point for training_point, training_point_df in gby_training_points]
+        y_max = list([np.max(training_point_df[task_metrics[task]]) for training_point, training_point_df in gby_training_points])
+        y_min = list([np.min(training_point_df[task_metrics[task]]) for training_point, training_point_df in gby_training_points])
+        y = list([np.median(training_point_df[task_metrics[task]]) for training_point, training_point_df in gby_training_points])
+        # The legend shows the positional index i, not the pattern ids listed in task_patterns.
+        fig.circle(x, y, color=pattern_colors[i], alpha=1, legend=f"Pattern {i}")
+        fig.line(x, y, color=pattern_colors[i], alpha=1)
+        fig.varea(x=x, y1=y_max, y2=y_min, color=pattern_colors[i], alpha=0.11)
+        # source = ColumnDataSource(data=dict(base=x, lower=y_min, upper=y_max))
+        # w = Whisker(source=source, base="base", upper="upper", lower="lower", line_color=pattern_colors[i], line_alpha=0.3)
+        # w.upper_head.line_color = pattern_colors[i]
+        # w.lower_head.line_color = pattern_colors[i]
+        # fig.add_layout(w)
+    return fig
+def cubic_easing(t):
+    if t < 0.5:
+        return 4 * t * t * t
+    p = 2 * t - 2
+    return 0.5 * p * p * p + 1
+def circ_easing(t):
+    if t < 0.5:
+        return 0.5 * (1 - math.sqrt(1 - 4 * (t * t)))
+    return 0.5 * (math.sqrt(-((2 * t) - 3) * ((2 * t) - 1)) + 1)

naacl_demo/main.py ADDED Viewed

	@@ -0,0 +1,294 @@

+from bokeh.events import Tap
+from bokeh.io import curdoc
+from bokeh.layouts import column
+from bokeh.models import Div, TextInput, RadioButtonGroup, TextAreaInput, Span, Button, Panel, Tabs
+from bokeh.models.tools import CrosshairTool
+from demo_utils import (
+    get_data,
+    prompt_boolq,
+    pvp_colors,
+    ctl_colors,
+    clf_colors,
+    reduct,
+    task_best_pattern,
+    plot_polygons_bokeh,
+    advantage_text,
+    data_difference,
+    calculate_overlap,
+    circ_easing,
+    average_advantage_text,
+    plot_three_polygons_bokeh,
+    tasks,
+    metric_tap,
+    neutral_tasks, pattern_graph,
+)
+from text import text1, text2, text3, text4, initial_passage, initial_question, text5
+########################################################################################################################
+# Basic dimensions
+########################################################################################################################
+# Pixel sizes for the page layout.
+# NOTE(review): in the code visible here only text_width is read (widget and text-box widths);
+# confirm the remaining values are used further down before pruning them.
+plot_width = 1200
+plot_height = 400
+sidebar_width = 400
+in_text_plot_height = 300
+text_width = 800
+widget_size = 400
+########################################################################################################################
+# Patternification widget
+########################################################################################################################
+# Editable inputs: a passage, a question, and a selector for one of three prompt patterns.
+passage = TextAreaInput(title="Passage", rows=3, value=initial_passage, max_width=text_width)
+passage.align = "center"
+question = TextInput(title="Question", value=initial_question, max_width=text_width)
+question.align = "center"
+radio_button_group = RadioButtonGroup(labels=["Pattern 1", "Pattern 2", "Pattern 3"], active=0, max_width=text_width)
+radio_button_group.align = "center"
+# CSS shared by the text boxes on the page (the prompt preview here, the advantage box below).
+box_style = {
+    "display": "block",
+    "margin": "0 auto",
+    "width": f"{text_width}px",
+    "text-align": "center",
+    "white-space": "pre-wrap",
+    "background": "#f4f4f4",
+    "border": "1px solid #ddd",
+    # "border-left": "3px solid #4d4945",
+    "color": "#666",
+    "page-break-inside": "avoid",
+    # "font-family": "monospace",
+    "font-size": "15px",
+    "line-height": "1.6",
+    "max-width": "100%",
+    "overflow": "hidden",
+    "min-height": "30px",
+    "word-wrap": "break-word",
+}
+# Preview of the rendered prompt; the selector's zero-based `active` index is passed as the pattern.
+prompt_box = Div(
+    text=prompt_boolq(passage.value, question.value, radio_button_group.active),
+    width=text_width,
+    style=box_style,
+    sizing_mode="scale_width",
+)
+prompt_box.align = "center"
+def update_prompt(attrname, old, new):
+    """Re-render the prompt preview from the current widget values (the arguments are not used)."""
+    prompt_box.text = prompt_boolq(passage.value, question.value, radio_button_group.active)
+# Any change to the passage, the question or the selected pattern refreshes the preview.
+passage.on_change("value", update_prompt)
+question.on_change("value", update_prompt)
+radio_button_group.on_change("active", update_prompt)
+patternification = column(passage, question, radio_button_group, prompt_box, sizing_mode="scale_width")
+patternification.align = "center"
+########################################################################################################################
+# Advantage diagram
+########################################################################################################################
+advantage_plots_per_task = []
+overlapping_range_per_task = []
+training_points_per_task = []
+clf_results_per_task = []
+pvp_results_per_task = []
+advantage_tabs = []
+advantage_all_figures = Tabs(tabs=advantage_tabs)
+advantage_box = Div(
+    text="Click within the comparison region to compute the data advantage for a performance level",
+    width=text_width,
+    style=box_style,
+    sizing_mode="scale_width",
+)
+advantage_box.align = "center"
+for task in tasks:
+    training_points, classifier_performances, pattern_performances = get_data(task)
+    training_points_per_task.append(list(training_points))
+    clf_results_per_task.append(reduct(classifier_performances, "accmax"))
+    pvp_results_per_task.append(reduct(pattern_performances, "accmax", task_best_pattern[task], "normal"))
+    advantage_plots_per_task.append(plot_polygons_bokeh(
+        task, training_points_per_task[-1], clf_results_per_task[-1], pvp_results_per_task[-1], clf_colors,
+        pvp_colors
+    ))
+    advantage_plots_per_task[-1].align = "center"
+    advantage_plots_per_task[-1].add_tools(CrosshairTool(dimensions="width", line_alpha=0.2))
+    overlapping_range_per_task.append(calculate_overlap(clf_results_per_task[-1], pvp_results_per_task[-1]))
+    advantage_tabs.append(Panel(child=advantage_plots_per_task[-1], title=task))
+    advantage_plots_per_task[-1].on_event(
+        Tap,
+        lambda event: metric_tap(
+            event,
+            overlapping_range_per_task[advantage_all_figures.active],
+            training_points_per_task[advantage_all_figures.active],
+            clf_results_per_task[advantage_all_figures.active],
+            pvp_results_per_task[advantage_all_figures.active],
+            advantage_box,
+            advantage_plots_per_task[advantage_all_figures.active],
+        ),
+    )
+    if task == "MNLI":
+        training_points_per_task.append(list(training_points))
+        clf_results_per_task.append(reduct(classifier_performances, "accmax"))
+        pvp_results_per_task.append(reduct(pattern_performances, "accmax", task_best_pattern[task], "normal"))
+        advantage_plots_per_task.append(plot_polygons_bokeh(
+            task, training_points_per_task[-1], clf_results_per_task[-1], pvp_results_per_task[-1], clf_colors,
+            pvp_colors, x_log_scale=True
+        ))
+        advantage_plots_per_task[-1].align = "center"
+        advantage_plots_per_task[-1].add_tools(CrosshairTool(dimensions="width", line_alpha=0.2))
+        overlapping_range_per_task.append(calculate_overlap(clf_results_per_task[-1], pvp_results_per_task[-1]))
+        advantage_tabs.append(Panel(child=advantage_plots_per_task[-1], title="MNLI (log scale)"))
+        advantage_plots_per_task[-1].on_event(
+            Tap,
+            lambda event: metric_tap(
+                event,
+                overlapping_range_per_task[advantage_all_figures.active],
+                training_points_per_task[advantage_all_figures.active],
+                clf_results_per_task[advantage_all_figures.active],
+                pvp_results_per_task[advantage_all_figures.active],
+                advantage_box,
+                advantage_plots_per_task[advantage_all_figures.active],
+            ),
+        )
+advantage_all_figures = Tabs(tabs=advantage_tabs)
+advantage_all_figures.align = "center"
+def on_integrate_click():
+    frames = 200
+    initial_placement = overlapping_range_per_task[advantage_all_figures.active][0]
+    if not isinstance(advantage_plots_per_task[advantage_all_figures.active].renderers[-1], Span):
+        metric_line = Span(
+            location=initial_placement,
+            line_alpha=0.7,
+            dimension="width",
+            line_color=clf_colors[0] if initial_placement < 0 else pvp_colors[0],
+            line_dash="dashed",
+            line_width=1,
+        )
+        advantage_plots_per_task[advantage_all_figures.active].renderers.extend([metric_line])
+    else:
+        advantage_plots_per_task[advantage_all_figures.active].renderers[-1].location = initial_placement
+        advantage_plots_per_task[advantage_all_figures.active].renderers[-1].line_color = clf_colors[
+            0] if initial_placement < 0 else pvp_colors[0]
+    average_advantage = 0
+    for i in range(1, frames):
+        metric_value = overlapping_range_per_task[advantage_all_figures.active][0] + (
+                overlapping_range_per_task[advantage_all_figures.active][1] -
+                overlapping_range_per_task[advantage_all_figures.active][0]) * (i / frames)
+        advantage_value = data_difference(metric_value, overlapping_range_per_task[advantage_all_figures.active],
+                                          training_points_per_task[advantage_all_figures.active],
+                                          clf_results_per_task[advantage_all_figures.active],
+                                          pvp_results_per_task[advantage_all_figures.active])
+        average_advantage = ((i - 1) * average_advantage + advantage_value) / i
+        advantage_plots_per_task[advantage_all_figures.active].renderers[-1].location = metric_value
+        advantage_plots_per_task[advantage_all_figures.active].renderers[-1].line_color = clf_colors[
+            0] if advantage_value < 0 else pvp_colors[0]
+        advantage_box.text = average_advantage_text(average_advantage)
+integrate = Button(width=175, max_width=175, label="Integrate over the whole region!")
+integrate.align = "center"
+integrate.on_click(on_integrate_click)
+def on_tab_change(attr, old, new):
+    advantage_box.text = "Click within the comparison region to compute the data advantage for a performance level"
+advantage_all_figures.on_change('active', on_tab_change)
+advantage_column = column(advantage_all_figures, advantage_box, integrate, sizing_mode="scale_width")
+########################################################################################################################
+# Null verbalizer diagram
+########################################################################################################################
+null_tabs = []
+null_all_figures = Tabs(tabs=null_tabs)
+for task in neutral_tasks:
+    training_points, classifier_performances, pattern_performances = get_data(task)
+    training_points = list(training_points)
+    clf_results = reduct(classifier_performances, "accmax")
+    pvp_results = reduct(pattern_performances, "accmax", task_best_pattern[task], "normal")
+    ctl_results = reduct(pattern_performances, "accmax", task_best_pattern[task], "neutral")
+    null_plot = plot_three_polygons_bokeh(task, training_points, clf_results, pvp_results, ctl_results, clf_colors,
+                                          pvp_colors, ctl_colors)
+    null_plot.align = "center"
+    null_plot.add_tools(CrosshairTool(dimensions="width", line_alpha=0.2))
+    null_tabs.append(Panel(child=null_plot, title=task))
+    if task == "MNLI":
+        null_plot = plot_three_polygons_bokeh(task, training_points, clf_results, pvp_results, ctl_results, clf_colors,
+                                              pvp_colors, ctl_colors, x_log_scale=True)
+        null_plot.align = "center"
+        null_plot.add_tools(CrosshairTool(dimensions="width", line_alpha=0.2))
+        null_tabs.append(Panel(child=null_plot, title="MNLI (log scale)"))
+null_all_figures = Tabs(tabs=null_tabs)
+null_all_figures.align = "center"
+########################################################################################################################
+# Patterns diagram
+########################################################################################################################
+pattern_tabs = []
+pattern_all_figures = Tabs(tabs=pattern_tabs)
+for task in tasks:
+    pattern_plot = pattern_graph(task)
+    pattern_plot.align = "center"
+    pattern_plot.add_tools(CrosshairTool(dimensions="width", line_alpha=0.2))
+    pattern_tabs.append(Panel(child=pattern_plot, title=task))
+pattern_all_figures = Tabs(tabs=pattern_tabs)
+pattern_all_figures.align = "center"
+########################################################################################################################
+# Add write-up text
+########################################################################################################################
+main_text_style = {
+    "min-height": "100px",
+    "overflow": "hidden",
+    "display": "block",
+    "margin": "auto",
+    "width": f"{text_width}px",
+    "font-size": "18px",
+}
+textbox1 = Div(text=text1, style=main_text_style)
+textbox2 = Div(text=text2, style=main_text_style)
+textbox3 = Div(text=text3, style=main_text_style)
+textbox4 = Div(text=text4, style=main_text_style)
+textbox5 = Div(text=text5, style=main_text_style)
+textbox1.align = "center"
+textbox2.align = "center"
+textbox3.align = "center"
+textbox4.align = "center"
+textbox5.align = "center"
+########################################################################################################################
+# Set up layouts and add to document
+########################################################################################################################
+main_body = column(textbox1, patternification, textbox2, advantage_column, textbox3, null_all_figures, textbox4, pattern_all_figures,
+                   textbox5, sizing_mode="scale_width")
+main_body.align = "center"
+curdoc().add_root(main_body)
+curdoc().title = "How many data points is a prompt worth ?"

naacl_demo/text.md ADDED Viewed

	@@ -0,0 +1,82 @@

+Pre-trained language models, fine-tuned with task-specific heads, are the backbone of applied NLP, and bigger and bigger language models are coming. With this in mind, alternative methods are emerging to compete with the classifier heads used in BERT, UniLM and GPT. In particular, GPT-3 has popularized prompts, natural language inputs designed to steer the pre-trained language model itself into solving the task, rather than a classifier built on top of it.
+Prompts are interesting because they allow a practitioner to give information to the model, although in a very different fashion from standard ML supervision. In our NAACL 2021 paper, we investigate prompt-based fine-tuning, a promising alternative fine-tuning approach, and find that prompts often yield an edge over the standard approach. As we interpret a prompt as additional human-crafted information for the model, we measure that edge in terms of data points and quantify: **how many data points is a prompt worth?**
+## Prompting
+In order to adapt pre-trained language models to a task, the main method is to replace the final token prediction layer of the original model with a randomly initialized linear classifier head. Supervised task data is then used to train the modified model via backpropagation, learning weights for this new head but also modifying weights deeper in the model. In this work, we call this a _head_ model.
+A competing approach is _prompting_: a broad class of methods that attempt to use the initial language model to answer the task by predicting words correlated with the classes instead of a class label. This allows them to perform classification while preserving the language model functionality. For this, _prompts_ are used: input sequences designed to produce the desired answer as textual output.
+Although this may sound abstract, this is a very natural way to reason about text for humans in practice: school exercises, for example, tend to be presented as a text input (for example, an article about Mars) and a question ("Is there life on Mars?") with an expected answer in natural text ("No"<sup>1</sup>) that maps to one of the classes of the task (presumably here, "No" to `False` and "Yes" to `True`). In this paradigm, task-specific data is presented to the model much like a grammar exercise where a student would need to fill in blanks in a fixed way over a list of sequences. Prompting attempts to use the pre-training information contained in the language model explicitly, rather than implicitly through hidden representations that get fed into the linear classifier head.
+Here's an example for SuperGLUE task BoolQ, which provides a text <span style="color: #0c593d">passage</span> and a <span style="color: #031154">question</span> and expects a boolean yes-or-no answer. This data is combined with a <span style="color: #910713">**pattern**</span> into a sequence with a single <span style="color: #ba9004">**masked token**</span> that the model must predict. This prediction is turned into a classification prediction with a pre-set *verbalizer*, a mapping between tokens and classes: the model probabilities on this token for *yes* and *no* are compared, with the final prediction being `True` if *yes* dominates and `False` if *no* does.
+![image](mockups/boolqpatterns.png)
+## Fine-tuning
+With this, we have turned our general language model into a task-specific classifier. These language model classifiers based on prompts have been used in very diverse ways:
+- The preserved language modeling functionality from the pre-trained model allows them to perform without additional data, as opposed to linear classifier _heads_ that are initialized from scratch and always start at random performance. A variety of papers have used this for zero-shot classification.
+- In order to incorporate supervised task data, they can use backpropagation with the usual language modeling cross-entropy loss objective: the verbalizer token associated with the correct class then serves as the correct token prediction. This is a component of PET, and is the objective used by T5 - although T5 uses prefixes to indicate the task rather than describing it with a natural-language prompt.
+- They can also use _priming_, where the sequence that needs to be filled in is prefixed with a list of correctly-filled examples. No backpropagation is used, and the weights of the language model are never modified: instead, it can attend to correct examples at inference time. This is the method used by GPT-3.
+- Finally, PET uses prompt models to pseudo-label unlabeled data that is then fed to a linear head model.
+In this paper, our goal is to present the fairest comparison possible with head models, so we fine-tune with backpropagation.
+## How many data points is a prompt worth?
+As we have seen, both heads and prompting can be used in a task specific supervised setting. The core difference is that the prompted model is given a specific sentence that roughly describes the task in addition to supervised examples. In some sense, this sentence is supervision as it tells the model about the task, but it is qualitatively a very different form of supervision than is standard in ML. How should we think about this supervision? How do we quantify how “zero-shot” this setup really is?
+We do this by comparing the _head_ and _prompt_ setups on the SuperGLUE tasks and MNLI. For each task, we extract subsets of the dataset of growing size, and repeat fine-tuning on `RoBERTa-large` with both methods on every subset, keeping everything else the same. For fairness, we tune the hyperparameters on the head baseline until they've attained the level of performance of the BERT++ baseline from the SuperGLUE leaderboard, and keep them the same for the _prompt_ model.
+The curves of final performance (on each task's metric) vs dataset size are plotted below for each task <sup>2</sup>. They allow us to contrast the amount of data required to attain a certain level of performance with both setups on a given task. We call this difference the _data advantage_ of a training setup over the other at that level of performance. We call the range of performance that has been attained by both models the _comparison window_. By integrating over it we get the _average data advantage_ of a method over the other on the task. Graphically, that is simply the area between the curves, divided by the height of the comparison window. <sup>3</sup>
+![image](mockups/advantage.png)
+Here's a recapitulative table of the average data advantage of the prompt model over the head model per task, with error bounds obtained by a bootstrapping approach where we hold out one of the 4 head runs and 4 prompt runs (16 combinations total for every data size), and compute the standard deviation of those outcomes. Results are very different from task to task; they even vary for the same task on different datasets, for example for MNLI and RTE, both entailment tasks. However, on every task but WiC <sup>4</sup>, the prompt method has a significant edge. **The additional information provided by the prompt is consistently equivalent to hundreds of data points**.
+|                | MNLI     | BoolQ  | CB   | COPA    | MultiRC<sup>5</sup> | RTE    | WiC     | WSC     |
+|----------------|----------|--------|------|---------|----------|--------|---------|---------|
+| Prompt vs Head | 3506±536 | 752±46 | 90±2 | 288±242 | 384±378  | 282±34 | -424±74 | 281±137 |
+## Patterns and verbalizers
+#### Control verbalizers
+Prompting has for now mostly been used as a tool for zero-shot classification, which is a natural use case. However, zero-shot is usually tricky and requires perfectly aligning the prompt and verbalizer. We have already shown that prompting could be applied more generally, including in the full-data regime. In order to contrast the zero-shot and adaptive natures of prompts, we consider a _null verbalizer_, a control with a verbalizer that is completely decorrelated from the task. For tasks that only require filling in one token (thus excluding the more free-form COPA and WSC), we replace the verbalizers, for example, "yes", "no", "maybe", "right" or "wrong",  with random first names. This makes the model unusable without training data, much like a head model. We plot the corresponding curves and perform the same advantage analysis below:
+![image](mockups/nullverbalizer.png)
+|                | MNLI     | BoolQ  | CB   | MultiRC<sup>5</sup> | RTE    | WiC     |
+|----------------|----------|--------|------|----------|--------|---------|
+| Prompt vs Head | 3506±536 | 752±46 | 90±2 | 384±378  | 282±34 | -424±74 |
+| Prompt vs Null | 150±252  | 299±81 | 78±2 | 74±56    | 404±68 | -354±166 |
+| Null vs Head   | 3355±612 | 453±90 | 12±1 | 309±320  | -122±62 | -70±160 |
+Results are noisier than for the straight prompt vs head comparison; however, we find that even with a null verbalizer, the language model is able to adapt to the task, generally catching up with the proper prompted model even with a few data points, and generally doing either on par with or better than the head model, showing the inductive bias of the prompt patterns is beneficial even without an informative verbalizer.
+#### Influence of the pattern choice
+Another choice that can make or break zero-shot classification is that of the pattern, and we investigate whether that still holds in our setting. In all of our experiments, we have re-used the pattern choices from PET - two or three quite different formulations per task - and repeated all of our prompt experiments with every pattern available on the task. We plot results below; they show that the choice of prompt does not have a significant influence, being always within random seed variance.
+![image](mockups/prompts.png)
+## Mot de la fin
+In this work, we investigate alternate methods of fine-tuning based on natural language prompts, that aim to use the language modeling ability of pre-trained models explicitly through word predictions, instead of implicitly through linear classifiers based on the model's internal representations. We isolate the problem of fine-tuning prompt-based classifier language models with backpropagation, and find that they generally outperform standard fine-tuned linear classifiers. We estimate this advantage in terms of data points to measure the additional information provided by the human via the prompt, and find that **writing a prompt is consistently worth hundreds of data points**. Furthermore, this advantage holds even with non-informative target tokens and is fairly robust to the choice of prompt.
+For practitioners, we believe that prompt-based fine-tuning should become a standard tool: especially for small- and middle-size task-specific datasets, designing a prompt yourself is a small effort for a sizable data advantage. For researchers, we believe that a lot of questions remain unexplored in this space: Why is the same prompt worth 3500 MNLI data points but only 282 RTE data points? How are prompts related to standard ML supervision? Do they react differently to adversarial or out-of-domain examples, since they have some zero-shot behaviour?
+<sup>1</sup>: Or at least not that we know of.
+<sup>2</sup>: A sharp-eyed reader will have noticed that all those curves are monotonic. We've performed 4 runs for every experiment (i.e. every data size of every task for head and prompt models). For clarity, and because fine-tuning can sometimes fail for both methods, resulting in negative outliers, we report for every data size the maximum performance that has been attained at this data size or smaller, which we call the _accumulated maximum_ aggregate. This does not have a big impact on the reported data advantage besides reducing variance, and the graphical interpretation would still hold even with non-monotonic curves.
+<sup>3</sup>: We treat each metric linearly to calculate advantage; alternatively, we could re-parameterize the y axis for each task. This choice does not have a consistent effect for or against prompting. For example, emphasizing gains close to convergence increases prompting advantage on CB and MNLI but decreases it on COPA or BoolQ.
+<sup>4</sup>: where, interestingly, PET had already found prompting to be ineffective
+<sup>5</sup>: The comparison window of MultiRC is too small as the head baseline fails to learn beyond majority class; we use the full region for a lower-bound result.

naacl_demo/text.py ADDED Viewed

	@@ -0,0 +1,169 @@

+# HTML for the article header (title, avatar, date, author), the introduction and the "Prompting" section.
+text1 = """<h1 id="how-big-should-my-language-model-be">How many data points is a prompt worth?</h1>
+<img class='center' style='height: 5em; float: right;' src='https://raw.githubusercontent.com/TevenLeScao/transformer-xl/master/pytorch/assets/avatar_logo_joint.png' alt='avatar'>
+<h4>Published on April 6, 2021.</h4>
+<h4>Teven Le Scao, researcher at Hugging Face • <a href="https://twitter.com/Fluke_Ellington">@Fluke_Ellington</a> </h4>
+<p>Pre-trained language models, fine-tuned with task-specific heads, are the backbone of applied NLP, and bigger and bigger language models are coming. With this in mind, alternative methods are emerging to compete with the classifier heads used in <a href="https://arxiv.org/abs/1810.04805">BERT</a>, <a href="https://arxiv.org/abs/1905.03197">UniLM</a> and <a href="https://openai.com/blog/language-unsupervised/">GPT</a>. In particular, GPT-3 has popularized prompts, natural language inputs designed to steer the pre-trained language model itself into solving the task, rather than a classifier built on top of it. </p>
+<p>Prompts are interesting because they allow a practitioner to give information to the model, although in a very different fashion from standard ML supervision. In our NAACL 2021 <a href="https://arxiv.org/abs/2103.08493">paper</a> with <a href="http://rush-nlp.com/">Sasha Rush</a>, we investigate prompt-based fine-tuning, a promising alternative fine-tuning approach, and find that prompts often yield an edge over the standard approach. As we interpret a prompt as additional human-crafted information for the model, we measure that edge in terms of data points and quantify: <strong>how many data points is a prompt worth?</strong> </p>
+<h2 id="prompting">Prompting</h2>
+<p>In order to adapt pre-trained language models to a task, the main method is to replace the final token prediction layer of the original model with a randomly initialized linear classifier head. Supervised task data is then used to train the modified model via backpropagation, learning weights for this new head but also modifying weights deeper in the model. In this work, we call this a <em>head</em> model. </p>
+<p>A competing approach is <em>prompting</em>: a broad class of methods that attempt to use the initial language model to answer the task by predicting words correlated with the classes instead of a class label. This allows them to perform classification while preserving the language model functionality. For this, <em>prompts</em> are used: input sequences designed to produce the desired answer as textual output. </p>
+<p id="footnote1back">Although this may sound abstract, this is a very natural way to reason about text for humans in practice: school exercises, for example, tend to be presented as a text input (for example, an article about Mars) and a question (&quot;Is there life on Mars?&quot;) with an expected answer in natural text (&quot;No&quot;<a href="#footnote1"><sup>1</sup></a>) that maps to one of the classes of the task (presumably here, &quot;No&quot; to <code>False</code> and &quot;Yes&quot; to <code>True</code>). In this paradigm, task-specific data is presented to the model much like a grammar exercise where a student would need to fill in blanks in a fixed way over a list of sequences. Prompting attempts to use the pre-training information contained in the language model explicitly, rather than implicitly through hidden representations that get fed into the linear classifier head.  </p>
+<p>Here&#39;s an example for <a href="https://arxiv.org/abs/1905.00537">SuperGLUE</a> task <a href="https://arxiv.org/abs/1905.10044">BoolQ</a>, which provides a text <span style="color: #0c593d">passage</span> and a <span style="color: #031154">question</span> and expects a boolean yes-or-no answer. This data is combined with a <span style="color: #910713"><strong>pattern</strong></span> into a sequence with a single <span style="color: #ba9004"><strong>masked token</strong></span> that the model must predict. This prediction is turned into a classification prediction with a pre-set <em>verbalizer</em>, a mapping between tokens and classes: the model probabilities on this token for <em>yes</em> and <em>no</em> are compared, with the final prediction being <code>True</code> if <em>yes</em> dominates and <code>False</code> if <em>no</em> does.</p>
+"""
+# HTML for the "Fine-tuning" section and the opening of "How many data points is a prompt worth?".
+text2 = """<h2 id="fine-tuning">Fine-tuning</h2>
+<p>With this, we have turned our general language model into a task-specific classifier. These language model classifiers based on prompts have been used in very diverse ways:  </p>
+<ul>
+<li>The preserved language modeling functionality from the pre-trained model allows them to perform without additional data, as opposed to linear classifier <em>heads</em> that are initialized from scratch and always start at random performance. A variety of papers have used this for <a href="https://arxiv.org/abs/1912.10165">zero-shot classification.</a>  </li>
+<li>In order to incorporate supervised task data, they can use backpropagation with the usual language modeling cross-entropy loss objective: the verbalizer token associated with the correct class then serves as the correct token prediction. This is a component of <a href="https://arxiv.org/abs/2001.07676">PET</a>, and is the objective used by <a href="https://arxiv.org/abs/1910.10683">T5</a> - although T5 uses prefixes to indicate the task rather than describing it with a natural-language prompt.  </li>
+<li>They can also use <em>priming</em>, where the sequence that needs to be filled in is prefixed with a list of correctly-filled examples. No backpropagation is used, and the weights of the language model are never modified: instead, it can attend to correct examples at inference time. This is the method used by <a href="https://arxiv.org/abs/2005.14165">GPT-3</a>.  </li>
+<li>Finally, PET uses prompt models to pseudo-label unlabeled data that is then fed to a linear head model.  </li>
+</ul>
+<p>In this paper, our goal is to present the fairest comparison possible with head models, so we fine-tune with backpropagation.</p>
+<h2 id="how-many-data-points-is-a-prompt-worth-">How many data points is a prompt worth?</h2>
+<p>As we have seen, both heads and prompting can be used in a task specific supervised setting. The core difference is that the prompted model is given a specific sentence that roughly describes the task in addition to supervised examples. In some sense, this sentence is supervision as it tells the model about the task, but it is qualitatively a very different form of supervision than is standard in ML. How should we think about this supervision? How do we quantify how “zero-shot” this setup really is?  </p>
+<p>We do this by comparing the <em>head</em> and <em>prompt</em> setups on the SuperGLUE tasks and MNLI. For each task, we extract subsets of the dataset of growing size, and repeat fine-tuning on <a href="https://arxiv.org/abs/1907.11692"><code>RoBERTa-large</code></a> with both methods on every subset, keeping everything else the same. For fairness, we tune the hyperparameters on the head baseline until they&#39;ve attained the level of performance of the BERT++ baseline from the SuperGLUE leaderboard, and keep them the same for the <em>prompt</em> model. </p>
+<p id="footnote2back">The curves of final performance (on each task&#39;s metric) vs dataset size are plotted below for each task <a href="#footnote2"><sup>2</sup></a>. They allow us to contrast the amount of data required to attain a certain level of performance with both setups on a given task. We call this difference the <em>data advantage</em> of a training setup over the other at that level of performance. We call the range of performance that has been attained by both models the <em>comparison window</em>. By integrating over it we get the <em>average data advantage</em> of a method over the other on the task. Graphically, that is simply the area between the curves, divided by the height of the comparison window. <a href="#footnote3"><sup>3</sup></a></p>
+"""
+text3 = """<html>
+<head>
+<style>
+table, th, td {
+  border: 1px solid black;
+  border-collapse: collapse;
+}
+.styled-table {
+  margin-left: auto;
+  margin-right: auto;
+}
+.styled-table {
+    border-collapse: collapse;
+    font-size: 1em;
+    font-family: sans-serif;
+    min-width: 400px;
+    box-shadow: 0 0 20px rgba(0, 0, 0, 0.15);
+}
+.styled-table thead tr {
+    background-color: #ffebcd;
+    color: #000000;
+    text-align: left;
+}
+.styled-table th,
+.styled-table td {
+    padding: 6px 8px;
+    font-size: 13px;
+}
+.styled-table tbody tr {
+    border-bottom: 1px solid #dddddd;
+}
+.styled-table tbody tr:nth-of-type(even) {
+    background-color: #f3f3f3;
+}
+.styled-table tbody tr:last-of-type {
+    border-bottom: 2px solid #29004a;
+}
+}
+</style>
+</head>
+<body>
+<p id="footnote4back">Here&#39;s a recapitulative table of the average data advantage of the prompt model over the head model per task, with error bounds obtained by a bootstrapping approach where we hold out one of the 4 head runs and 4 prompt runs (16 combinations total for every data size), and compute the standard deviation of those outcomes. Results are very different from task to task; they even vary for the same task on different dataset, for example for MNLI and RTE, both entailment tasks. However, on every task but WiC <a href="#footnote4"><sup>4</sup></a>, the prompt method has a significant edge. <strong>The additional information provided by the prompt is consistently equivalent to hundreds of data points</strong>.  </p>
+<table id="footnote5back" class="styled-table">
+<thead>
+<tr>
+<th></th>
+<th><a href="https://arxiv.org/abs/1704.05426">MNLI</a></th>
+<th><a href="https://arxiv.org/abs/1905.10044">BoolQ</a></th>
+<th><a href="https://ojs.ub.uni-konstanz.de/sub/index.php/sub/article/view/601">CB</a></th>
+<th><a href="https://people.ict.usc.edu/~gordon/publications/AAAI-SPRING11A.PDF">COPA</a></th>
+<th><a href="https://www.aclweb.org/anthology/N18-1023/">MultiRC</a><sup><a href="#footnote5">5</a></sup></th>
+<th><a href="https://link.springer.com/chapter/10.1007/978-94-024-0881-2_42">RTE</a></th>
+<th><a href="https://arxiv.org/abs/1808.09121">WiC</a></th>
+<th><a href="https://arxiv.org/abs/1808.09121">WSC</a></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>Prompt vs Head</td>
+<td>3506±536</td>
+<td>752±46</td>
+<td>90±2</td>
+<td>288±242</td>
+<td>384±378</td>
+<td>282±34</td>
+<td>-424±74</td>
+<td>281±137</td>
+</tr>
+</tbody>
+</table>
+<h2 id="patterns-and-verbalizers">Patterns and verbalizers</h2>
+<h4 id="control-verbalizers">Control verbalizers</h4>
+<p>Prompting has for now mostly been used as a tool for zero-shot classification, which is a natural use case. However, zero-shot is usually tricky and requires perfectly aligning the prompt and verbalizer. We have already shown that prompting could be applied more generally, including in the full-data regime. In order to contrast the zero-shot and adaptive natures of prompts, we consider a <em>null verbalizer</em>, a control with a verbalizer that is completely decorrelated from the task. For tasks that only require filling in one token (thus excluding the more free-form COPA and WSC), we replace the verbalizers, for example, &quot;yes&quot;, &quot;no&quot;, &quot;maybe&quot;, &quot;right&quot; or &quot;wrong&quot;,  with random first names. This makes the model unusable without training data, much like a head model. We plot the corresponding curves and perform the same advantage analysis below:</p>
+</body>
+</html>
+"""
+# HTML for the null-verbalizer results table, its discussion, and the "Influence of the pattern choice" paragraph.
+text4 = """<table id="footnote6back" class="styled-table">
+<thead>
+<tr>
+<th></th>
+<th>MNLI</th>
+<th>BoolQ</th>
+<th>CB</th>
+<th>MultiRC<a href="#footnote5"><sup>6</sup></a></th>
+<th>RTE</th>
+<th>WiC</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>Prompt vs Head</td>
+<td>3506±536</td>
+<td>752±46</td>
+<td>90±2</td>
+<td>384±378</td>
+<td>282±34</td>
+<td>-424±74</td>
+</tr>
+<tr>
+<td>Prompt vs Null</td>
+<td>150±252</td>
+<td>299±81</td>
+<td>78±2</td>
+<td>74±56</td>
+<td>404±68</td>
+<td>-354±166</td>
+</tr>
+<tr>
+<td>Null vs Head</td>
+<td>3355±612</td>
+<td>453±90</td>
+<td>12±1</td>
+<td>309±320</td>
+<td>-122±62</td>
+<td>-70±160</td>
+</tr>
+</tbody>
+</table>
+<p>Results are noisier than for the straight prompt vs head comparison; however, we find that even with a null verbalizer, the language model is able to adapt to the task, generally catching up with the proper prompted model even with a few data points, and generally doing either on par with or better than the head model, showing the inductive bias of the prompt patterns is beneficial even without an informative verbalizer.  </p>
+<h4 id="influence-of-the-pattern-choice">Influence of the pattern choice</h4>
+<p>Another choice that can make or break zero-shot classification is that of the pattern, and we investigate whether that still holds in our setting. In all of our experiments, we have re-used the pattern choices from PET - two or three quite different formulations per task - and repeated all of our prompt experiments with every pattern available on the task. We plot the median, maximum and minimum performance over the 4 runs for each pattern below; they show that the choice of prompt does not generally have a significant influence, with only the few-shot settings of BoolQ and WiC seeing a pattern consistently above the others.  </p>
+"""
+text5 = """<h2 id="mot-de-la-fin">Mot de la fin</h2>
+<p>In this work, we investigate alternate methods of fine-tuning based on natural language prompts, that aim to use the language modeling ability of pre-trained models explicitly through word predictions, instead of implicitly through linear classifiers based on the model&#39;s internal representations. We isolate the problem of fine-tuning prompt-based classifier language models with backpropagation, and find that they generally outperform standard fine-tuned linear classifiers. We estimate this advantage in terms of data point to measure the additional information provided by the human via the prompt, and find that <strong>writing a prompt is consistently worth hundreds of data points</strong>. Furthermore, this advantage holds even with non-informative target tokens and is fairly robust to the choice of prompt. </p>
+<p>For practitioners, we believe that prompt-based fine-tuning should become a standard tool: especially for small- and middle-size task-specific datasets, designing a prompt yourself is a small effort for a sizable data advantage. For researchers, we believe that a lot of questions remain unexplored in this space: Why is the same prompt worth 3500 MNLI data points but only 282 RTE data points? How are prompts related to standard ML supervision? Do they react differently to adversarial or out-of domain examples, since they have some zero-shot behaviour?</p>
+<p id="footnote1"><sup><a href="#footnote1back">1</a></sup>: Or at least not that we know of.</p>
+<p id="footnote2"><sup><a href="#footnote2back">2</a></sup>: A sharp-eyed reader will have noticed that all those curves are monotonic. We&#39;ve performed 4 runs for every experiment (i.e. every data size of every task for head and prompt models). For clarity, and because fine-tuning can sometimes fail for both methods, resulting in negative outliers, we report for every data size the maximum performance that has been attained at this data size or smaller, which we call the <em>accumulated maximum</em> aggregate. This does not have a big impact on the reported data advantage besides reducing variance, and the graphical interpretation would still hold even with non-monotonic curves. </p>
+<p id="footnote3"><sup><a href="#footnote3back">3</a></sup>: We treat each metric linearly to calculate advantage; alternatively, we could re-parameterize the y axis for each task. This choice does not have a consistent effect for or against prompting. For example, emphasizing gains close to convergence increases prompting advantage on CB and MNLI but decreases it on COPA or BoolQ. </p>
+<p id="footnote4"><sup><a href="#footnote4back">4</a></sup>: where, interestingly, PET had already found prompting to be ineffective</p>
+<p id="footnote5"><sup><a href="#footnote5back">5</a> <a href="#footnote6back">6</a></sup>: The comparison window of MultiRC is too small as the head baseline fails to learn beyond majority class; we use the full region for a lower-bound result.</p>
+"""
+initial_passage = "In informal games, it is customary to announce 'check' when making a move that puts the opponent's king in check. In formal competitions, however, check is rarely announced."
+initial_question = "do you always have to say check in chess?"

requirements.txt ADDED Viewed

	@@ -0,0 +1,22 @@

+bokeh==2.3.0
+cycler==0.10.0
+Jinja2==2.11.2
+kiwisolver==1.3.1
+MarkupSafe==1.1.1
+matplotlib==3.4.1
+numpy==1.18.4
+packaging==20.4
+pandas==1.0.3
+Pillow==7.1.2
+pyparsing==2.4.7
+python-dateutil==2.8.1
+pytz==2020.1
+PyYAML==5.3.1
+randomcolor==0.4.4.5
+scipy==1.4.1
+seaborn==0.11.1
+Shapely==1.7.1
+six==1.15.0
+tornado==6.0.4
+typing-extensions==3.7.4.2
+virtualenv-clone==0.5.4