|
import torch |
|
import gradio as gr |
|
from transformers import AlignProcessor, AlignModel |
|
|
|
|
|
# Pick the compute device once at import time: CUDA when PyTorch reports
# it as available, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# The processor and the model are both loaded from the same
# "kakaobrain/align-base" checkpoint.
processor = AlignProcessor.from_pretrained("kakaobrain/align-base")
model = AlignModel.from_pretrained("kakaobrain/align-base")
model = model.to(device)

# Switch the model to evaluation mode; this script only runs inference.
model.eval()
|
|
|
|
|
def predict(image, labels):
    """Score an image against free-form candidate labels with ALIGN.

    Args:
        image: The image to classify; forwarded to the processor as
            ``images``. The interface in this file requests PIL images.
        labels: Candidate labels as one comma-separated string, e.g.
            ``"dinosaur, drawing, forest"``. Whitespace around each label
            is ignored and empty entries are dropped, so ``"a,b"``,
            ``"a, b"`` and ``"a ,  b,"`` are all read as ``["a", "b"]``.

    Returns:
        A dict mapping each candidate label to its probability (a float),
        obtained by a softmax over the image-to-text logits.

    Raises:
        ValueError: If no image was supplied, or if ``labels`` contains no
            non-empty label.
    """
    if image is None:
        raise ValueError("Please provide an image to classify.")

    # Split on bare commas and trim each piece instead of splitting on the
    # exact ", " sequence, so a missing or doubled space does not merge
    # labels or leave stray whitespace in them.
    candidates = [piece.strip() for piece in (labels or "").split(",")]
    candidates = [label for label in candidates if label]
    if not candidates:
        raise ValueError(
            "Please enter at least one candidate label, separated by commas."
        )

    inputs = processor(images=image, text=candidates, return_tensors="pt").to(device)

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        outputs = model(**inputs)

    # Softmax over dim=1 normalises the scores across the candidate labels;
    # row 0 is the single image that was passed in.
    probs = outputs.logits_per_image.softmax(dim=1).cpu().numpy()
    return {label: float(score) for label, score in zip(candidates, probs[0])}
|
|
|
|
|
# Blurb passed to gr.Interface as `description`: an image next to a short
# introduction to ALIGN and instructions for using the demo.
description = """
<div class="container" style="display:flex;">
<div class="image">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/blog/132_vit_align/align.png" alt="ALIGN performance" />
</div>
<div class="text">
<p>Gradio demo for <a href="https://huggingface.co/docs/transformers/main/en/model_doc/align">ALIGN</a>,
as introduced in <a href="https://arxiv.org/abs/2102.05918"><i>"Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision"</i></a>. ALIGN features a dual-encoder architecture with EfficientNet and BERT as its vision and text encoders, and learns to align visual and text representations with contrastive learning.
Unlike previous work, ALIGN leverages a massive noisy dataset and shows that the scale of the corpus can be used to achieve SOTA representations with a simple recipe.
\n\nALIGN is not open-sourced and the `kakaobrain/align-base` model used for this demo is based on the Kakao Brain implementation that follows the original paper. The model is trained on the open source [COYO](https://github.com/kakaobrain/coyo-dataset) dataset by the Kakao Brain team. To perform zero-shot image classification with ALIGN, upload an image and enter your candidate labels as free-form text separated by a comma followed by a space.</p>
</div>
</div>
"""
|
|
|
# Build the demo UI: an image plus a one-line textbox of candidate labels
# go into `predict`, and its result is shown with the "label" output.
demo = gr.Interface(
    fn=predict,
    # Listed in the same order as predict's (image, labels) parameters.
    inputs=[
        gr.inputs.Image(type="pil", label="Image to classify"),
        gr.inputs.Textbox(
            label="Comma separated candidate labels",
            placeholder="Enter labels separated by ', '",
            lines=1,
        ),
    ],
    outputs="label",
    # Each example row is [image path, candidate labels].
    examples=[
        ["assets/cartoon.jpeg", "dinosaur, drawing, forest"],
        ["assets/painting.jpeg", "watercolor painting, oil painting, boats"],
    ],
    title="Zero-Shot Image Classification with ALIGN",
    description=description,
    theme="grass",
)

# Start serving the interface as soon as the script runs.
demo.launch()
|
|