22 commits
c9d6d3b
add Libri4Mix metadata
ShakedDovrat Mar 7, 2021
8a6dd77
minor fixes
ShakedDovrat Mar 7, 2021
c5ff7b2
add 4 speakers 8khz min mix_clean script
ShakedDovrat Mar 7, 2021
a80a637
create metadata faster
ShakedDovrat Mar 7, 2021
bd94cab
Merge branch 'master' of github.com:ShakedDovrat/LibriMix
ShakedDovrat Mar 7, 2021
a05329a
update 4 speakers script
ShakedDovrat Mar 7, 2021
87a8afd
faster metadata creation + multi-processing + ignore train-360
ShakedDovrat Mar 7, 2021
3577b87
make set_noise_pairs faster + fix bug where taking all utt instead of…
ShakedDovrat Mar 8, 2021
fdbe534
re-enable parallel
ShakedDovrat Mar 8, 2021
8995b7f
ignore train-clean-100.csv
ShakedDovrat Mar 9, 2021
9b952d0
1000 dev/test samples instead of 3000 for >= 10 speakers
ShakedDovrat Mar 9, 2021
76387d4
ensure_dir (fix dir doesn't exist error)
ShakedDovrat Mar 9, 2021
323fda8
1000 samples for n_src>=10 & ignore train-100
ShakedDovrat Mar 9, 2021
09d4b8a
hash mix id for n_src >= 10 & keep original in CSVs
ShakedDovrat Mar 14, 2021
af67985
re-use utterances for train, fixed 20k samples for train
ShakedDovrat Mar 21, 2021
110d974
clean code
ShakedDovrat Jun 9, 2021
b238552
remove libri4mix files
ShakedDovrat Jun 9, 2021
2eb7a2c
add comments + README.md instuctions + make code a bit more readable
ShakedDovrat Nov 6, 2021
651eb12
better documentation + minor fixes
ShakedDovrat Nov 6, 2021
14cdb95
clearer README.md
ShakedDovrat Nov 6, 2021
bb6eaa9
clearer README.md
ShakedDovrat Nov 6, 2021
ab09363
clearer README.md
ShakedDovrat Nov 6, 2021
49 changes: 48 additions & 1 deletion README.md
@@ -1,3 +1,50 @@
### LibriMix version for many speakers
This is a version of the LibriMix dataset repository intended for creating mixtures of many speakers.

This version of the dataset was used in the paper:

[Many-Speakers Single Channel Speech Separation with Optimal Permutation Training](https://arxiv.org/abs/2104.08955)

This is not an implementation of the method in the paper.

### About this version of the dataset
This is a modified version of the original LibriMix dataset code.
It is intended mainly for creating LibriMix datasets with many speakers (more than 5), but it can be used for any number of speakers.
**The original LibriMix code had issues that made data creation too slow, and in practice it could get stuck indefinitely for 10-15 speakers or more.**

This version was used in our paper:

```
S. Dovrat, E. Nachmani, L. Wolf. Many-Speakers Single Channel Speech Separation with Optimal Permutation Training. Annual Conference of the International Speech Communication Association (INTERSPEECH), 2021.
```

To recreate the datasets used in the paper, create your LibriMix datasets using this version.

Visit this
[GitHub comparison between the two versions](https://github.com/JorisCos/LibriMix/compare/master...ShakedDovrat:master)
to see the exact changes we made in this version.

#### Instructions
To create datasets with 2 or 3 speakers, skip this stage.

To create datasets with more than 3 speakers, you need to create the metadata first.

Run
[`generate_librimix_metadata.sh`](./generate_librimix_metadata.sh)
and then follow the same instructions as in the original LibriMix, stated below.
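
For example, to build the metadata for 10-speaker mixtures (the storage path below is illustrative):

```
./generate_librimix_metadata.sh 10 /path/to/storage
```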

The metadata creation script requires you to download the WHAM! noise dataset in addition to the LibriSpeech dataset
(a requirement inherited from the original LibriMix), even though noise is not used in our paper (we only use clean mixtures, without noise).

##### Altered generate_librimix.sh
We altered
[`generate_librimix.sh`](./generate_librimix.sh)
so that it only creates the type of data used in our paper:
8 kHz sampling, min mode, and clean mixtures.
We also added `n_src` as an argument to the script.
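
For example, to then generate the 10-speaker dataset itself (again, the storage path is illustrative):

```
./generate_librimix.sh 10 /path/to/storage
```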

--------------------------------------------

### About the dataset
LibriMix is an open source dataset for source separation in noisy
environments. It is derived from LibriSpeech signals (clean subset)
@@ -11,7 +58,7 @@ To generate LibriMix, clone the repo and run the main script :
```
git clone https://github.com/JorisCos/LibriMix
cd LibriMix
./generate_librimix.sh storage_dir
./generate_librimix.sh n_src storage_dir
```

Make sure that SoX is installed on your machine.
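
For example, on Debian/Ubuntu (package names may differ on other systems):

```
sudo apt-get install sox
```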
65 changes: 65 additions & 0 deletions download_librispeech_and_wham.sh
@@ -0,0 +1,65 @@
#!/bin/bash
set -eu # Exit on error

storage_dir=$1
librispeech_dir=$storage_dir/LibriSpeech
wham_dir=$storage_dir/wham_noise
librimix_outdir=$storage_dir/

function LibriSpeech_dev_clean() {
  if ! test -e $librispeech_dir/dev-clean; then
    echo "Download LibriSpeech/dev-clean into $storage_dir"
    # If downloading stalls for more than 20s, relaunch from previous state.
    wget -c --tries=0 --read-timeout=20 http://www.openslr.org/resources/12/dev-clean.tar.gz -P $storage_dir
    tar -xzf $storage_dir/dev-clean.tar.gz -C $storage_dir
    rm -rf $storage_dir/dev-clean.tar.gz
  fi
}

function LibriSpeech_test_clean() {
  if ! test -e $librispeech_dir/test-clean; then
    echo "Download LibriSpeech/test-clean into $storage_dir"
    # If downloading stalls for more than 20s, relaunch from previous state.
    wget -c --tries=0 --read-timeout=20 http://www.openslr.org/resources/12/test-clean.tar.gz -P $storage_dir
    tar -xzf $storage_dir/test-clean.tar.gz -C $storage_dir
    rm -rf $storage_dir/test-clean.tar.gz
  fi
}

function LibriSpeech_clean100() {
  if ! test -e $librispeech_dir/train-clean-100; then
    echo "Download LibriSpeech/train-clean-100 into $storage_dir"
    # If downloading stalls for more than 20s, relaunch from previous state.
    wget -c --tries=0 --read-timeout=20 http://www.openslr.org/resources/12/train-clean-100.tar.gz -P $storage_dir
    tar -xzf $storage_dir/train-clean-100.tar.gz -C $storage_dir
    rm -rf $storage_dir/train-clean-100.tar.gz
  fi
}

function LibriSpeech_clean360() {
  if ! test -e $librispeech_dir/train-clean-360; then
    echo "Download LibriSpeech/train-clean-360 into $storage_dir"
    # If downloading stalls for more than 20s, relaunch from previous state.
    wget -c --tries=0 --read-timeout=20 http://www.openslr.org/resources/12/train-clean-360.tar.gz -P $storage_dir
    tar -xzf $storage_dir/train-clean-360.tar.gz -C $storage_dir
    rm -rf $storage_dir/train-clean-360.tar.gz
  fi
}

function wham() {
  if ! test -e $wham_dir; then
    echo "Download wham_noise into $storage_dir"
    # If downloading stalls for more than 20s, relaunch from previous state.
    wget -c --tries=0 --read-timeout=20 https://storage.googleapis.com/whisper-public/wham_noise.zip -P $storage_dir
    unzip -qn $storage_dir/wham_noise.zip -d $storage_dir
    rm -rf $storage_dir/wham_noise.zip
  fi
}

LibriSpeech_dev_clean &
LibriSpeech_test_clean &
LibriSpeech_clean100 &
LibriSpeech_clean360 &
wham &

wait
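
This helper script can also be run on its own; it takes the storage directory as its only argument (illustrative path):

```
./download_librispeech_and_wham.sh /path/to/storage
```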
82 changes: 13 additions & 69 deletions generate_librimix.sh
100644 → 100755
@@ -1,83 +1,27 @@
#!/bin/bash
set -eu # Exit on error

storage_dir=$1
n_src=$1
storage_dir=$2
librispeech_dir=$storage_dir/LibriSpeech
wham_dir=$storage_dir/wham_noise
librimix_outdir=$storage_dir/

function LibriSpeech_dev_clean() {
  if ! test -e $librispeech_dir/dev-clean; then
    echo "Download LibriSpeech/dev-clean into $storage_dir"
    # If downloading stalls for more than 20s, relaunch from previous state.
    wget -c --tries=0 --read-timeout=20 http://www.openslr.org/resources/12/dev-clean.tar.gz -P $storage_dir
    tar -xzf $storage_dir/dev-clean.tar.gz -C $storage_dir
    rm -rf $storage_dir/dev-clean.tar.gz
  fi
}

function LibriSpeech_test_clean() {
  if ! test -e $librispeech_dir/test-clean; then
    echo "Download LibriSpeech/test-clean into $storage_dir"
    # If downloading stalls for more than 20s, relaunch from previous state.
    wget -c --tries=0 --read-timeout=20 http://www.openslr.org/resources/12/test-clean.tar.gz -P $storage_dir
    tar -xzf $storage_dir/test-clean.tar.gz -C $storage_dir
    rm -rf $storage_dir/test-clean.tar.gz
  fi
}

function LibriSpeech_clean100() {
  if ! test -e $librispeech_dir/train-clean-100; then
    echo "Download LibriSpeech/train-clean-100 into $storage_dir"
    # If downloading stalls for more than 20s, relaunch from previous state.
    wget -c --tries=0 --read-timeout=20 http://www.openslr.org/resources/12/train-clean-100.tar.gz -P $storage_dir
    tar -xzf $storage_dir/train-clean-100.tar.gz -C $storage_dir
    rm -rf $storage_dir/train-clean-100.tar.gz
  fi
}

function LibriSpeech_clean360() {
  if ! test -e $librispeech_dir/train-clean-360; then
    echo "Download LibriSpeech/train-clean-360 into $storage_dir"
    # If downloading stalls for more than 20s, relaunch from previous state.
    wget -c --tries=0 --read-timeout=20 http://www.openslr.org/resources/12/train-clean-360.tar.gz -P $storage_dir
    tar -xzf $storage_dir/train-clean-360.tar.gz -C $storage_dir
    rm -rf $storage_dir/train-clean-360.tar.gz
  fi
}

function wham() {
  if ! test -e $wham_dir; then
    echo "Download wham_noise into $storage_dir"
    # If downloading stalls for more than 20s, relaunch from previous state.
    wget -c --tries=0 --read-timeout=20 https://storage.googleapis.com/whisper-public/wham_noise.zip -P $storage_dir
    unzip -qn $storage_dir/wham_noise.zip -d $storage_dir
    rm -rf $storage_dir/wham_noise.zip
  fi
}

LibriSpeech_dev_clean &
LibriSpeech_test_clean &
LibriSpeech_clean100 &
LibriSpeech_clean360 &
wham &

wait
./download_librispeech_and_wham.sh $storage_dir

# Path to python
python_path=python

# If you wish to rerun this script in the future please comment this line out.
$python_path scripts/augment_train_noise.py --wham_dir $wham_dir

for n_src in 2 3; do
  metadata_dir=metadata/Libri$n_src"Mix"
  $python_path scripts/create_librimix_from_metadata.py --librispeech_dir $librispeech_dir \
    --wham_dir $wham_dir \
    --metadata_dir $metadata_dir \
    --librimix_outdir $librimix_outdir \
    --n_src $n_src \
    --freqs 8k 16k \
    --modes min max \
    --types mix_clean mix_both mix_single
done

metadata_dir=metadata/Libri$n_src"Mix"
$python_path scripts/create_librimix_from_metadata.py --librispeech_dir $librispeech_dir \
  --wham_dir $wham_dir \
  --metadata_dir $metadata_dir \
  --librimix_outdir $librimix_outdir \
  --n_src $n_src \
  --freqs 8k \
  --modes min \
  --types mix_clean
20 changes: 20 additions & 0 deletions generate_librimix_metadata.sh
@@ -0,0 +1,20 @@
#!/bin/bash
set -eu # Exit on error

n_src=$1
storage_dir=$2
librispeech_dir=$storage_dir/LibriSpeech
wham_dir=$storage_dir/wham_noise
metadata_dir=./metadata
librispeech_md_dir=$metadata_dir/LibriSpeech
wham_md_dir=$metadata_dir/Wham_noise
metadata_outdir=$metadata_dir/Libri$n_src"Mix"

./download_librispeech_and_wham.sh $storage_dir

python scripts/create_librimix_metadata.py --librispeech_dir $librispeech_dir \
  --librispeech_md_dir $librispeech_md_dir \
  --wham_dir $wham_dir \
  --wham_md_dir $wham_md_dir \
  --metadata_outdir $metadata_outdir \
  --n_src $n_src
5 changes: 3 additions & 2 deletions scripts/augment_train_noise.py
@@ -54,8 +54,9 @@ def apply_fx(sound_path, speed):
    # Get the effect
    fx = (AudioEffectsChain().speed(speed))
    s, rate = sf.read(sound_path)
    # Get 1st channel
    s = s[:, 0]
    if len(s.shape) > 1:
        # Get 1st channel
        s = s[:, 0]
    # Apply effect
    s = fx(s)
    # Write the file
35 changes: 27 additions & 8 deletions scripts/create_librimix_from_metadata.py
Expand Up @@ -4,6 +4,7 @@
import pandas as pd
import numpy as np
import functools
import hashlib
from scipy.signal import resample_poly
import tqdm.contrib.concurrent

@@ -137,12 +138,12 @@ def process_utterances(md_file, librispeech_dir, wham_dir, freq, mode, subdirs,
            [row for _, row in md_file.iterrows()],
            chunksize=10,
    ):
        for mix_id, snr_list, abs_mix_path, abs_source_path_list, abs_noise_path, length, subdir in results:
        for mix_id, orig_mix_id, snr_list, abs_mix_path, abs_source_path_list, abs_noise_path, length, subdir in results:
            # Add line to the dataframes
            add_to_metrics_metadata(md_dic[f"metrics_{dir_name}_{subdir}"],
                                    mix_id, snr_list)
                                    mix_id, orig_mix_id, snr_list)
            add_to_mixture_metadata(md_dic[f'mixture_{dir_name}_{subdir}'],
                                    mix_id, abs_mix_path, abs_source_path_list,
                                    mix_id, orig_mix_id, abs_mix_path, abs_source_path_list,
                                    abs_noise_path, length, subdir)

    # Save the metadata files
@@ -157,6 +158,14 @@ def process_utterance(n_src, librispeech_dir, wham_dir, freq, mode, subdirs, dir
    # Get sources and mixture infos
    mix_id, gain_list, sources = read_sources(row, n_src, librispeech_dir,
                                              wham_dir)

    orig_mix_id = mix_id
    # @ShakedDovrat note: Encode mix_id because it might be too long for a file name (max 255 chars on Linux).
    # The original mix_id, which is a concatenation of the utterances used in the mixture, is kept in the metadata file under "original_mixture_ID".
    if n_src >= 10:
        hash_object = hashlib.md5(mix_id.encode())
        mix_id = hash_object.hexdigest()

    # Transform sources
    transformed_sources = transform_sources(sources, freq, mode, gain_list)
    # Write the sources and get their paths
@@ -186,7 +195,7 @@ def process_utterance(n_src, librispeech_dir, wham_dir, freq, mode, subdirs, dir
        length = len(mixture)
        # Compute SNR
        snr_list = compute_snr_list(mixture, sources_to_mix)
        res.append((mix_id, snr_list, abs_mix_path, abs_source_path_list, abs_noise_path, length, subdir))
        res.append((mix_id, orig_mix_id, snr_list, abs_mix_path, abs_source_path_list, abs_noise_path, length, subdir))

    return res

@@ -195,6 +204,7 @@ def create_empty_metrics_md(n_src, subdir):
""" Create the metrics dataframe"""
metrics_dataframe = pd.DataFrame()
metrics_dataframe['mixture_ID'] = {}
metrics_dataframe['original_mixture_ID'] = {}
if subdir == 'mix_clean':
for i in range(n_src):
metrics_dataframe[f"source_{i + 1}_SNR"] = {}
@@ -212,6 +222,7 @@ def create_empty_mixture_md(n_src, subdir):
""" Create the mixture dataframe"""
mixture_dataframe = pd.DataFrame()
mixture_dataframe['mixture_ID'] = {}
mixture_dataframe['original_mixture_ID'] = {}
mixture_dataframe['mixture_path'] = {}
if subdir == 'mix_clean':
for i in range(n_src):
@@ -344,6 +355,7 @@ def write_sources(mix_id, transformed_sources, subdirs, dir_path, freq, n_src):
    for src, src_dir in zip(transformed_sources[:n_src], subdirs[:n_src]):
        save_path = os.path.join(dir_path, src_dir, ex_filename)
        abs_save_path = os.path.abspath(save_path)
        ensure_dir(os.path.dirname(abs_save_path))
        sf.write(abs_save_path, src, freq)
        abs_source_path_list.append(abs_save_path)
    return abs_source_path_list
@@ -355,6 +367,7 @@ def write_noise(mix_id, transformed_sources, dir_path, freq):
    ex_filename = mix_id + '.wav'
    save_path = os.path.join(dir_path, 'noise', ex_filename)
    abs_save_path = os.path.abspath(save_path)
    ensure_dir(os.path.dirname(abs_save_path))
    sf.write(abs_save_path, noise, freq)
    return abs_save_path

@@ -373,6 +386,7 @@ def write_mix(mix_id, mixture, dir_path, subdir, freq):
    ex_filename = mix_id + '.wav'
    save_path = os.path.join(dir_path, subdir, ex_filename)
    abs_save_path = os.path.abspath(save_path)
    ensure_dir(os.path.dirname(abs_save_path))
    sf.write(abs_save_path, mixture, freq)
    return abs_save_path

@@ -391,13 +405,13 @@ def snr_xy(x, y):
    return 10 * np.log10(np.mean(x ** 2) / (np.mean(y ** 2) + EPS) + EPS)


def add_to_metrics_metadata(metrics_df, mixture_id, snr_list):
def add_to_metrics_metadata(metrics_df, mixture_id, original_mixture_id, snr_list):
    """ Add a new line to metrics_df"""
    row_metrics = [mixture_id] + snr_list
    row_metrics = [mixture_id, original_mixture_id] + snr_list
    metrics_df.loc[len(metrics_df)] = row_metrics


def add_to_mixture_metadata(mix_df, mix_id, abs_mix_path, abs_sources_path,
def add_to_mixture_metadata(mix_df, mix_id, orig_mix_id, abs_mix_path, abs_sources_path,
                            abs_noise_path, length, subdir):
    """ Add a new line to mixture_df """
    sources_path = abs_sources_path
@@ -406,10 +420,15 @@ def add_to_mixture_metadata(mix_df, mix_id, abs_mix_path, abs_sources_path,
        noise_path = []
    elif subdir == 'mix_single':
        sources_path = [abs_sources_path[0]]
    row_mixture = [mix_id, abs_mix_path] + sources_path + noise_path + [length]
    row_mixture = [mix_id, orig_mix_id, abs_mix_path] + sources_path + noise_path + [length]
    mix_df.loc[len(mix_df)] = row_mixture


def ensure_dir(d):
    # exist_ok=True also avoids a race when parallel workers create the same directory.
    os.makedirs(d, exist_ok=True)


if __name__ == "__main__":
    args = parser.parse_args()
    main(args)