
Commit e3214a5

Style fix (#35)

* precommit changes
* add precommit hook

1 parent: 7cf6c04

24 files changed: +1126 -699 lines

Diff for: .pre-commit-config.yaml (+16)

```diff
@@ -0,0 +1,16 @@
+# To run locally:
+# % pre-commit run -a
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v2.3.0
+    hooks:
+      - id: check-yaml
+      - id: end-of-file-fixer
+      - id: trailing-whitespace
+  - repo: local
+    hooks:
+      - id: black
+        name: black
+        entry: black
+        language: system
+        types: [python]
```
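
For contributors, this is the workflow the new config enables, as a minimal sketch (standard `pre-commit` CLI commands; note that the `black` hook uses `language: system`, so `black` itself must already be installed in your environment):

```
pip install pre-commit
pre-commit install   # install the git hook; checks then run on every commit
pre-commit run -a    # or run all hooks once over the whole repo
```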

Diff for: LICENSE (+1 -1)

```diff
@@ -25,4 +25,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
+SOFTWARE.
```

The -/+ pair prints identically: the change is whitespace-only, evidently the `end-of-file-fixer` hook adding the file's missing final newline.

Diff for: README.md (+3 -3)

````diff
@@ -51,7 +51,7 @@ There are four scales in our competition:
 - `xlarge`: 12.8B pool size, 12.8B examples seen
 
 
-The script will create two directories inside `$data_dir`: `metadata` and `shards`.
+The script will create two directories inside `$data_dir`: `metadata` and `shards`.
 
 Along with the images and captions, this script will also download metadata, including `.parquet` files that contain the image urls, captions, and other potentially useful information such as the similarities between the images and captions given by trained OpenAI CLIP models.
 If the flag `--download_npz` is used, the script will also download the `.npz` files with features extracted by the trained OpenAI CLIP models for each sample.
@@ -161,7 +161,7 @@ A image clustering based method that retains samples whose images have content c
 python baselines.py --metadata_dir path/to/metadata --save_path path/to/image_based.npy --name image_based --image_based_scale small --batch_size 512
 ```
 
-**Note**: this baseline requires pre-computed image cluster centroids which will be downloaded automatically the first time you run it.
+**Note**: this baseline requires pre-computed image cluster centroids which will be downloaded automatically the first time you run it.
 If you want to generate the centroids yourself, please see `baselines/image_based_clustering.md` for instructions.
 
 ### Intersection of image-based and CLIP score filtering
@@ -239,7 +239,7 @@ We also highly encourage participants to also upload the checkpoints for their t
 
 ## Checkpoints
 
-We release the checkpoints for our main baselines as part of [OpenCLIP](https://github.com/mlfoundations/open_clip). More details can be found at https://github.com/mlfoundations/open_clip/blob/main/docs/datacomp_models.md.
+We release the checkpoints for our main baselines as part of [OpenCLIP](https://github.com/mlfoundations/open_clip). More details can be found at https://github.com/mlfoundations/open_clip/blob/main/docs/datacomp_models.md.
 
 ## Citation
 
````

Each -/+ pair above renders identically: the removed lines carried trailing whitespace, which the new `trailing-whitespace` hook strips.
Diff for: aggregate_scores.py (+38 -21)

```diff
@@ -1,51 +1,68 @@
 import argparse
-import pandas as pd
 import os
+
 import numpy as np
+import pandas as pd
 
 DATASET_GROUPS = {
-    'ImageNet dist. shifts': {
-        'ImageNet Sketch', 'ImageNet v2', 'ImageNet-A', 'ImageNet-O', 'ImageNet-R', 'ObjectNet'
+    "ImageNet dist. shifts": {
+        "ImageNet Sketch",
+        "ImageNet v2",
+        "ImageNet-A",
+        "ImageNet-O",
+        "ImageNet-R",
+        "ObjectNet",
     },
-    'VTAB': {
-        'Caltech-101', 'CIFAR-100', 'CLEVR Counts', 'CLEVR Distance', 'Describable Textures', 'EuroSAT',
-        'KITTI Vehicle Distance', 'Oxford Flowers-102', 'Oxford-IIIT Pet', 'PatchCamelyon', 'RESISC45',
-        'SVHN', 'SUN397'},
-    'Retrieval': {'Flickr', 'MSCOCO', 'WinoGAViL'},
+    "VTAB": {
+        "Caltech-101",
+        "CIFAR-100",
+        "CLEVR Counts",
+        "CLEVR Distance",
+        "Describable Textures",
+        "EuroSAT",
+        "KITTI Vehicle Distance",
+        "Oxford Flowers-102",
+        "Oxford-IIIT Pet",
+        "PatchCamelyon",
+        "RESISC45",
+        "SVHN",
+        "SUN397",
+    },
+    "Retrieval": {"Flickr", "MSCOCO", "WinoGAViL"},
 }
 
 
 def get_aggregate_scores(results_file):
     """Returns a dictionary with aggregated scores from a results file."""
     df = pd.read_json(results_file, lines=True)
-    df = pd.concat([df.drop(['metrics'], axis=1), df['metrics'].apply(pd.Series)], axis=1)
-    df = df.dropna(subset=['main_metric'])
-    assert len(df) == 38, f'Results file has unexpected size, {len(df)}'
+    df = pd.concat(
+        [df.drop(["metrics"], axis=1), df["metrics"].apply(pd.Series)], axis=1
+    )
+    df = df.dropna(subset=["main_metric"])
+    assert len(df) == 38, f"Results file has unexpected size, {len(df)}"
     results = dict(zip(df.dataset, df.main_metric))
-
-    aggregate_results = {
-        'ImageNet': results['ImageNet 1k']
-    }
+
+    aggregate_results = {"ImageNet": results["ImageNet 1k"]}
 
     for group, datasets in DATASET_GROUPS.items():
         score = np.mean([results[dataset] for dataset in datasets])
         aggregate_results[group] = score
-
 
-    aggregate_results['Average'] = np.mean(list(results.values()))
+    aggregate_results["Average"] = np.mean(list(results.values()))
 
     return aggregate_results
 
 
-
-if __name__ == '__main__':
+if __name__ == "__main__":
     parser = argparse.ArgumentParser()
 
-    parser.add_argument('--input', type=str, required=True, help='Path to the results file.')
+    parser.add_argument(
+        "--input", type=str, required=True, help="Path to the results file."
+    )
 
     args = parser.parse_args()
 
     scores = get_aggregate_scores(args.input)
 
     for group, score in scores.items():
-        print(f'{group}: {score:.3f}')
+        print(f"{group}: {score:.3f}")
```

Diff for: baselines/apply_filter.py (+4 -4)

```diff
@@ -1,8 +1,10 @@
+import multiprocessing as mp
 import os
+import time
 from functools import partial
-import multiprocessing as mp
 from multiprocessing import Pool
-from typing import Any, Set, Tuple, List, Union
+from queue import Empty
+from typing import Any, List, Set, Tuple, Union
 
 import fasttext
 import fsspec
@@ -13,8 +15,6 @@
 import torch
 from nltk.corpus import wordnet
 from tqdm import tqdm
-import time
-from queue import Empty
 
 from baselines.utils import download, worker_threadpool
 
```

The new import order (plain `import` statements before `from` imports, alphabetized, standard library separated from third-party) matches what `isort` produces by default; only `black` is wired into the committed hook config, so this reordering was presumably done separately.

Diff for: baselines/image_based_clustering.md (+2 -3)

````diff
@@ -1,6 +1,6 @@
 # Clustering
 
-Generates cluster centroids from the `image-based` baselines using k-means clustering.
+Generates cluster centroids from the `image-based` baselines using k-means clustering.
 
 
 ## Installing dependencies
@@ -20,7 +20,7 @@ To run clustering for the `small` pool, run the following command:
 
 ```
 python image_based_clustering.py \
-    --metadata_dir path/to/metadata \
+    --metadata_dir path/to/metadata \
     --save_path path/to/output/centroids \
     --num_clusters 100000 \
     --sample_ratio -1.0 \
@@ -34,4 +34,3 @@ Explanation to several arguments:
 - `disable_caption_filtering`: whether to disable caption filtering to the dataset. Default is `False`
 
 On a machine with 8 GPUs and 26 CPUs (there are 26 parquet files for the `small` pool), the clustering process takes about 10 minutes.
-
````

The first two changes are trailing-whitespace fixes; the final `-` removes an extra blank line at the end of the file.

Diff for: baselines/image_based_clustering.py (+30 -23)

```diff
@@ -8,7 +8,7 @@
 import multiprocessing as mp
 from functools import partial
 from multiprocessing import Pool
-from typing import Any, Tuple, List
+from typing import Any, List, Tuple
 
 import faiss
 import fasttext
@@ -18,17 +18,14 @@
 import torch
 from tqdm import tqdm
 
-from baselines.utils import random_seed, download
 from baselines.apply_filter import caption_filter
+from baselines.utils import download, random_seed
 
 torch.backends.cudnn.benchmark = True
 
 
 def train_kmeans(
-    embeddings: np.ndarray,
-    num_clusters: int,
-    num_gpus: int,
-    seed: int = 0
+    embeddings: np.ndarray, num_clusters: int, num_gpus: int, seed: int = 0
 ) -> torch.Tensor:
     """train kmeans on embeddings
 
@@ -59,7 +56,9 @@ def train_kmeans(
     if num_gpus == 1:
         index = faiss.GpuIndexFlatL2(res[0], d, flat_config[0])
     else:
-        indexes = [faiss.GpuIndexFlatL2(res[i], d, flat_config[i]) for i in range(num_gpus)]
+        indexes = [
+            faiss.GpuIndexFlatL2(res[i], d, flat_config[i]) for i in range(num_gpus)
+        ]
         index = faiss.IndexReplicas()
         for sub_index in indexes:
             index.addIndex(sub_index)
@@ -72,10 +71,10 @@ def train_kmeans(
 
 
 def load_embedding_helper(
-    fs_root: Tuple[Any, str],
-    key: str = "l14_img",
-    caption_filtering: bool = False,
-    sample_ratio: float = -1.0
+    fs_root: Tuple[Any, str],
+    key: str = "l14_img",
+    caption_filtering: bool = False,
+    sample_ratio: float = -1.0,
 ) -> np.ndarray:
     """worker function to load embeddings
 
@@ -89,8 +88,12 @@ def load_embedding_helper(
     fs, path_root = fs_root
     embed = np.load(fs.open(f"{path_root}.npz"))[key]
     if caption_filtering:
-        lang_detect_model = fasttext.load_model(download("fasttext", "~/.cache/fasttext"))
-        df = pd.read_parquet(f"{path_root}.parquet", columns=["uid", "text"], filesystem=fs)
+        lang_detect_model = fasttext.load_model(
+            download("fasttext", "~/.cache/fasttext")
+        )
+        df = pd.read_parquet(
+            f"{path_root}.parquet", columns=["uid", "text"], filesystem=fs
+        )
         mask = caption_filter(df, lang_detect_model)
         embed = embed[mask]
     if sample_ratio > 0:
@@ -101,11 +104,11 @@ def load_embedding_helper(
 
 
 def load_embedding(
-    paths: List[Tuple[Any, str]],
-    n_workers: int = 10,
-    key: str = "l14_img",
-    caption_filtering: bool = False,
-    sample_ratio: float = -1.0
+    paths: List[Tuple[Any, str]],
+    n_workers: int = 10,
+    key: str = "l14_img",
+    caption_filtering: bool = False,
+    sample_ratio: float = -1.0,
 ) -> np.ndarray:
     """worker function to load embeddings
 
@@ -128,7 +131,9 @@ def load_embedding(
     with Pool(n_workers) as pool:
         embeds = [
             res
-            for res in tqdm(pool.imap(worker, paths), total=len(paths))  # imap so that it can be reproduced
+            for res in tqdm(
+                pool.imap(worker, paths), total=len(paths)
+            )  # imap so that it can be reproduced
             if len(res) > 0
         ]
     return np.vstack(embeds)
@@ -147,10 +152,10 @@ def load_embedding(
     )
     parser.add_argument(
         "--embedding_key",
-        default='l14_img',
+        default="l14_img",
         type=str,
-        choices=['l14_img', 'b32_img'],
-        help="precomputed embeddings used for clustering"
+        choices=["l14_img", "b32_img"],
+        help="precomputed embeddings used for clustering",
    )
     parser.add_argument(
         "--sample_ratio",
@@ -202,5 +207,7 @@ def load_embedding(
 
     print(f"start clustering: num_clusters = {num_clusters}, num_gpus = {num_gpus}")
     embeddings = embeddings.astype(np.float32)
-    centroids = train_kmeans(embeddings, num_clusters, num_gpus=num_gpus, seed=args.seed)
+    centroids = train_kmeans(
+        embeddings, num_clusters, num_gpus=num_gpus, seed=args.seed
+    )
     torch.save(centroids, args.save_path, pickle_protocol=4)
```
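
For orientation, a minimal sketch of the same k-means step using faiss's high-level `Kmeans` wrapper; this is an illustrative assumption, not the script's code, which builds `GpuIndexFlatL2` indexes by hand and replicates them across GPUs with `IndexReplicas`:

```python
import faiss
import numpy as np

# Dummy sizes for illustration; the script clusters ViT-L/14 image
# embeddings (768-d) into 100000 centroids.
d, k = 768, 1000
embeddings = np.random.rand(50_000, d).astype(np.float32)

# gpu=True shards k-means training across all visible GPUs.
kmeans = faiss.Kmeans(d, k, niter=20, seed=0, gpu=True)
kmeans.train(embeddings)
centroids = kmeans.centroids  # ndarray of shape (k, d)
```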
