Contrastive image-text models such as CLIP form the building blocks of many
state-of-the-art systems. While they excel at recognizing common generic
concepts, they still struggle on fine-grained entities that are rare in, or even
absent from, the pre-training dataset. Hence, a key ingredient to their success
has been the use of large-scale curated pre-training data aimed at expanding
the set of concepts that they can memorize during the pre-training stage. In
this work, we explore an alternative to encoding fine-grained knowledge
directly into the model's parameters: we instead train the model to retrieve
this knowledge from an external memory. Specifically, we propose to equip
existing vision-text models with the ability to refine their embeddings with
cross-modal information retrieved from an external memory at inference time, which
greatly improves their zero-shot predictions. Remarkably, we show that this can
be done with a lightweight, single-layer fusion transformer on top of a
frozen CLIP. Our experiments validate that our retrieval-enhanced contrastive
(RECO) training improves CLIP performance substantially on several challenging
fine-grained tasks: for example, +10.9 on Stanford Cars, +10.2 on CUB-2011, and
+7.3 on the recent OVEN benchmark.
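
The mechanism the abstract describes (retrieving cross-modal neighbours from an external memory and fusing them with a frozen CLIP embedding through a single-layer fusion transformer) can be illustrated with a minimal sketch. The module name, tensor shapes, and hyper-parameters below are assumptions chosen for illustration, not the authors' released implementation.

import torch
import torch.nn as nn
import torch.nn.functional as F

class RetrievalFusion(nn.Module):
    # Hypothetical sketch: refine a frozen CLIP embedding with its k nearest
    # cross-modal memory entries using one transformer encoder layer.
    def __init__(self, dim=512, num_heads=8, k=16):
        super().__init__()
        self.k = k
        self.fusion = nn.TransformerEncoderLayer(d_model=dim, nhead=num_heads,
                                                 batch_first=True)

    def forward(self, query_emb, memory_emb):
        # query_emb:  (B, D) L2-normalized embeddings from a frozen CLIP encoder
        # memory_emb: (N, D) L2-normalized embeddings of the other modality in memory
        sims = query_emb @ memory_emb.t()              # (B, N) cosine similarities
        topk = sims.topk(self.k, dim=-1).indices       # (B, k) nearest-neighbour indices
        retrieved = memory_emb[topk]                   # (B, k, D) retrieved entries
        tokens = torch.cat([query_emb.unsqueeze(1), retrieved], dim=1)  # (B, 1+k, D)
        fused = self.fusion(tokens)[:, 0]              # refined query token
        return F.normalize(fused, dim=-1)              # re-normalize for similarity scoring

In this reading, the refined embedding would replace the original CLIP embedding when computing image-text similarities, consistent with the zero-shot setup the abstract describes; the CLIP backbone itself stays frozen and only the fusion layer is trained.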
@misc{iscen2023retrievalenhanced,
  author   = {Iscen, Ahmet and Caron, Mathilde and Fathi, Alireza and Schmid, Cordelia},
  title    = {Retrieval-Enhanced Contrastive Vision-Text Models},
  year     = {2023},
  url      = {http://arxiv.org/abs/2306.07196},
  note     = {arXiv:2306.07196},
  keywords = {llm retrieval}
}