long8v · long8v · May 6, 2024 · May 6, 2024
diff --git a/LeGrad/LICENSE b/LeGrad/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024 Walid Bousselham, Angie Boggust, Sofian Chaybouti, Hendrik Strobelt, Hilde Kuehne.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/LeGrad/README.md b/LeGrad/README.md
@@ -0,0 +1,109 @@
+# LeGrad
+
+<div align="center">
+<img src="./assets/logo_LeGrad.png" width="20%"/>
+</div>
+
+### [An Explainability Method for Vision Transformers via Feature Formation Sensitivity](https://arxiv.org/abs/2404.03214)
+_[Walid Bousselham](http://walidbousselham.com/)<sup>1</sup>, [Angie Boggust](http://angieboggust.com/)<sup>2</sup>, [Sofian Chaybouti](https://scholar.google.com/citations?user=8tewdk4AAAAJ&hl)<sup>1</sup>, [Hendrik Strobelt](http://hendrik.strobelt.com/)<sup>3,4</sup> and [Hilde Kuehne](https://hildekuehne.github.io/)<sup>1,3</sup>_
+
+<sup>1</sup> University of Bonn & Goethe University Frankfurt,
+<sup>2</sup> MIT CSAIL,
+<sup>3</sup> MIT-IBM Watson AI Lab,
+<sup>4</sup> IBM Research.
+
+[![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/WalidBouss/LeGrad) 
+<a href="https://arxiv.org/abs/2404.03214"><img src="https://img.shields.io/badge/arXiv-Paper-b31b1b"></a>
+<a href="https://walidbousselham.com/LeGrad"><img src="https://img.shields.io/badge/Project-Website-red"></a>
+
+Vision-Language foundation models have shown remarkable performance in various zero-shot settings such as image retrieval, classification, or captioning.
+We propose LeGrad, an explainability method specifically designed for ViTs.
+With LeGrad, we explore the decision-making process of such models by leveraging their feature formation process.
+A by-product of understanding VL models' decision-making is the ability to produce a localised heatmap for any text prompt.
+
+The following is the code for a wrapper around the [OpenCLIP](https://github.com/mlfoundations/open_clip) library to equip VL models with LeGrad.
+
+<div align="center">
+<img src="./assets/teaser_figure.png" width="100%"/>
+</div>
+
+## :hammer: Installation
+The `legrad` library can be installed via pip:
+```bash
+$ pip install legrad_torch
+```
+
+## Demo
+- Try out our web demo on [HuggingFace Spaces](https://huggingface.co/spaces) [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/WalidBouss/LeGrad)
+- Run the demo on Google Colab: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1ooB4AB9NRRe6Z-VilZizFOlFpTiKQHAc?usp=sharing)
+- Run [`playground.py`](./playground.py) for a usage example.
+
+To run the gradio app locally, first install gradio and then run [`app.py`](./app.py):
+```bash
+$ pip install gradio
+$ python app.py
+```
+## Usage
+To see which pretrained models are available, use the following code snippet:
+```python
+import legrad
+legrad.list_pretrained()
+```
+
+### Single Image
+To process an image and a text prompt use the following code snippet:
+
+**Note**: the wrapper does not affect the original model, hence all the functionalities of OpenCLIP models can be used seamlessly.
+```python
+import requests
+from PIL import Image
+import open_clip
+import torch
+
+from legrad import LeWrapper, LePreprocess
+from legrad.utils import visualize
+
+# ------- model's parameters -------
+model_name = 'ViT-B-16'
+pretrained = 'laion2b_s34b_b88k'
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# ------- init model -------
+model, _, preprocess = open_clip.create_model_and_transforms(
+    model_name=model_name, pretrained=pretrained, device=device)
+tokenizer = open_clip.get_tokenizer(model_name=model_name)
+model.eval()
+# ------- Equip the model with LeGrad -------
+model = LeWrapper(model)
+# ___ (Optional): Wrapper for Higher-Res input image ___
+preprocess = LePreprocess(preprocess=preprocess, image_size=448)
+
+# ------- init inputs: image + text -------
+url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+image = preprocess(Image.open(requests.get(url, stream=True).raw)).unsqueeze(0).to(device)
+text = tokenizer(['a photo of a cat']).to(device)
+
+# -------
+text_embedding = model.encode_text(text, normalize=True)
+print(image.shape)
+explainability_map = model.compute_legrad_clip(image=image, text_embedding=text_embedding)
+
+# ___ (Optional): Visualize overlay of the image + heatmap ___
+visualize(heatmaps=explainability_map, image=image)
+```
+
+
+
+# :star: Acknowledgement
+This code is built as a wrapper around the [OpenCLIP](https://github.com/mlfoundations/open_clip) library from [LAION](https://laion.ai/); visit their repo for more vision-language models.
+This project also takes inspiration from [Transformer-MM-Explainability](https://github.com/hila-chefer/Transformer-MM-Explainability) and the [timm library](https://github.com/huggingface/pytorch-image-models); please visit their repositories.
+
+# :books: Citation
+If you find this repository useful, please consider citing our work :pencil: and giving a star :star2: :
+```
+@article{bousselham2024legrad,
+  author    = {Bousselham, Walid and Boggust, Angie and Chaybouti, Sofian and Strobelt, Hendrik and Kuehne, Hilde},
+  title     = {LeGrad: An Explainability Method for Vision Transformers via Feature Formation Sensitivity},
+  journal   = {arXiv preprint arXiv:2404.03214},
+  year      = {2024},
+}
+```
diff --git a/LeGrad/app.py b/LeGrad/app.py
@@ -0,0 +1,120 @@
+import requests
+import numpy as np
+import cv2 as cv2
+from PIL import Image
+
+import torch
+import torch.nn.functional as F
+import open_clip
+
+import gradio as gr
+
+from legrad import LeWrapper, LePreprocess
+
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # falls back to CPU when CUDA is unavailable
+layer_index = -2  # NOTE(review): not referenced anywhere else in the visible file — confirm it is still needed
+image_size = 448  # side length handed to LePreprocess and used to resize the image for the overlay in viz_func
+# ---------- Init CLIP Model ----------
+model_name = 'ViT-B-16'
+pretrained = 'laion2b_s34b_b88k'
+patch_size = 16  # NOTE(review): not referenced anywhere else in the visible file — confirm it is still needed
+
+model, _, preprocess = open_clip.create_model_and_transforms(model_name, pretrained=pretrained, device=device)
+tokenizer = open_clip.get_tokenizer(model_name)
+
+# ---------- Apply LeGrad's wrappers ----------
+model = LeWrapper(model)
+preprocess = LePreprocess(preprocess=preprocess, image_size=image_size)
+
+
+# ---------- Function to load image from URL ----------
+def change_to_url(url):
+    img_pil = Image.open(requests.get(url, stream=True).raw).convert('RGB')
+    return img_pil
+
+
+def _get_text_embedding(model, tokenizer, classes: list, device):
+    prompts = [f'a photo of a {cls}.' for cls in classes]
+
+    tokenized_prompts = tokenizer(prompts).to(device)
+
+    text_embedding = model.encode_text(tokenized_prompts)
+    text_embedding = F.normalize(text_embedding, dim=-1)
+    return text_embedding.unsqueeze(0)
+
+# ---------- Function to convert logits to heatmaps ----------
+def logits_to_heatmaps(logits, image_cv):
+    logits = logits[0, 0].detach().cpu().numpy()
+    logits = (logits * 255).astype('uint8')
+    heat_map = cv2.applyColorMap(logits, cv2.COLORMAP_JET)
+    viz = 0.4 * image_cv + 0.6 * heat_map
+    viz = cv2.cvtColor(viz.astype('uint8'), cv2.COLOR_BGR2RGB)
+    return viz
+
+
+# ---------- Main visualization function ----------
+def viz_func(url, image, text_query):
+    image_torch = preprocess(image).unsqueeze(0).to(device)
+    text_emb = _get_text_embedding(model, tokenizer, classes=[text_query], device=device)
+
+    # ------- Get LeGrad output -------
+    logits_legrad = model.compute_legrad(image=image_torch, text_embedding=text_emb)
+    # ------- Get Heatmpas -------
+    image_cv = cv2.cvtColor(np.array(image.resize((image_size, image_size))), cv2.COLOR_RGB2BGR)
+
+    viz_legrad = logits_to_heatmaps(logits=logits_legrad, image_cv=image_cv)
+    return viz_legrad
+
+inputs = [
+    gr.Textbox(label="Paste the url to the  selected image"),
+    gr.Image(type="pil", interactive=True, label='Select An Image'),
+    gr.Textbox(label="Text query"),
+    ]
+
+
+with gr.Blocks(css="#gradio-app-title { text-align: center; }") as demo:
+    gr.Markdown(
+        """
+        # **LeGrad: An Explainability Method for Vision Transformers via Feature Formation Sensitivity**
+        ### This demo that showcases LeGrad method to visualize the important regions in an image that correspond to a given text query.
+        The model used is OpenCLIP-ViT-B-16 (weights: `laion2b_s34b_b88k`)
+        """
+    )
+    with gr.Row():
+        with gr.Column():
+            gr.Markdown('# Select An Image')
+            selected_image = gr.Image(type="pil", interactive=True, label='')
+            gr.Markdown('## Paste the url to the  selected image')
+            url_query = gr.Textbox(label="")
+            gr.Markdown('# Create your Own query')
+            text_query = gr.Textbox(label='')
+            run_button = gr.Button(icon='https://cdn-icons-png.flaticon.com/512/3348/3348036.png')
+
+            inputs[0].change(fn=change_to_url, outputs=inputs[1], inputs=inputs[0])
+            gr.Markdown('## LeGrad Explanation')
+            le_grad_output = gr.Image(label='LeGrad')
+
+            run_button.click(fn=viz_func,
+                inputs=[url_query, selected_image, text_query],
+                outputs=[le_grad_output])
+
+        with gr.Column():
+            gr.Markdown('# Select a Premade Example')
+            gr.Examples(
+                examples=[
+                    ["gradio_app/assets/cats_remote_control.jpeg", "cat"],
+                    ["gradio_app/assets/cats_remote_control.jpeg", "remote control"],
+                    ["gradio_app/assets/la_baguette.webp", "la baguette"],
+                    ["gradio_app/assets/la_baguette.webp", "beret"],
+                    ["gradio_app/assets/pokemons.jpeg", "Pikachu"],
+                    ["gradio_app/assets/pokemons.jpeg", "Bulbasaur"],
+                    ["gradio_app/assets/pokemons.jpeg", "Charmander"],
+                    ["gradio_app/assets/pokemons.jpeg", "Pokemons"],
+                ],
+                inputs=[selected_image, text_query],
+                label=''
+            )
+
+demo.queue()
+demo.launch()
diff --git a/LeGrad/legrad/__init__.py b/LeGrad/legrad/__init__.py
@@ -0,0 +1,2 @@
+from .wrapper import LeWrapper, LePreprocess
+from .utils import *
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		from .wrapper import LeWrapper, LePreprocess
		from .utils import *