diff --git a/README.md b/README.md
index f32d895..6e98737 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,50 @@
+### LibriMix version for many speakers
+This is a version of the LibriMix dataset repository intended for creating mixtures of many speakers.
+
+This version of the dataset was used in the paper:
+
+[Many-Speakers Single Channel Speech Separation with Optimal Permutation Training](https://arxiv.org/abs/2104.08955)
+
+This is not an implementation of the method in the paper.
+
+### About this version of the dataset
+This version is a modified version of the original LibriMix dataset. 
+It is used mainly to create LibriMix datasets with many speakers (more than 5), but can be used for any number of speakers.
+**The original LibriMix scripts had issues that made data creation too slow, or caused it to hang indefinitely in practice, for 10-15 speakers or more.**
+
+This version was used in our paper:
+
+```
+S. Dovrat, E. Nachmani, L. Wolf. Many-Speakers Single Channel Speech Separation with Optimal Permutation Training. Annual Conference of the International Speech Communication Association (INTERSPEECH), 2021.
+```
+
+To recreate the datasets used there, create your LibriMix datasets using this version.
+
+Visit this 
+[GitHub comparison between the two versions](https://github.com/JorisCos/LibriMix/compare/master...ShakedDovrat:master)
+to see the exact changes we made in this version.
+
+#### Instructions
+To create datasets with 2 or 3 speakers, skip this metadata-creation stage.
+
+To create datasets with more than 3 speakers, you need to create the metadata first.
+
+Run
+[`generate_librimix_metadata.sh`](./generate_librimix_metadata.sh) as `./generate_librimix_metadata.sh n_src storage_dir`,
+and then follow the instructions from the original LibriMix, stated below.
+
+The original LibriMix script requires you to download the WHAM! noise dataset in addition to the LibriSpeech dataset,
+even though it is not used in our paper (we only use clean mixtures, without noise).
+
+##### Altered generate_librimix.sh
+We altered 
+[`generate_librimix.sh`](./generate_librimix.sh)
+slightly so that it only creates the type of data used in our paper:
+8 kHz sampling, `min` mode, and clean mixtures.
+We also added `n_src` as an argument to the script.
+
+--------------------------------------------
+
 ### About the dataset
 LibriMix is an open source dataset for source separation in noisy 
 environments. It is derived from LibriSpeech signals (clean subset) 
@@ -11,7 +58,7 @@ To generate LibriMix, clone the repo and run the main script :
 ```
 git clone https://github.com/JorisCos/LibriMix
 cd LibriMix 
-./generate_librimix.sh storage_dir
+./generate_librimix.sh n_src storage_dir
 ```
 
 Make sure that SoX is installed on your machine.
diff --git a/download_librispeech_and_wham.sh b/download_librispeech_and_wham.sh
new file mode 100755
index 0000000..77fb913
--- /dev/null
+++ b/download_librispeech_and_wham.sh
@@ -0,0 +1,65 @@
+#!/bin/bash
+set -eu  # Exit on error
+
+storage_dir=$1
+librispeech_dir=$storage_dir/LibriSpeech
+wham_dir=$storage_dir/wham_noise
+librimix_outdir=$storage_dir/
+
+function LibriSpeech_dev_clean() {
+	if ! test -e $librispeech_dir/dev-clean; then
+		echo "Download LibriSpeech/dev-clean into $storage_dir"
+		# If downloading stalls for more than 20s, relaunch from previous state.
+		wget -c --tries=0 --read-timeout=20 http://www.openslr.org/resources/12/dev-clean.tar.gz -P $storage_dir
+		tar -xzf $storage_dir/dev-clean.tar.gz -C $storage_dir
+		rm -rf $storage_dir/dev-clean.tar.gz
+	fi
+}
+
+function LibriSpeech_test_clean() {
+	if ! test -e $librispeech_dir/test-clean; then
+		echo "Download LibriSpeech/test-clean into $storage_dir"
+		# If downloading stalls for more than 20s, relaunch from previous state.
+		wget -c --tries=0 --read-timeout=20 http://www.openslr.org/resources/12/test-clean.tar.gz -P $storage_dir
+		tar -xzf $storage_dir/test-clean.tar.gz -C $storage_dir
+		rm -rf $storage_dir/test-clean.tar.gz
+	fi
+}
+
+function LibriSpeech_clean100() {
+	if ! test -e $librispeech_dir/train-clean-100; then
+		echo "Download LibriSpeech/train-clean-100 into $storage_dir"
+		# If downloading stalls for more than 20s, relaunch from previous state.
+		wget -c --tries=0 --read-timeout=20 http://www.openslr.org/resources/12/train-clean-100.tar.gz -P $storage_dir
+		tar -xzf $storage_dir/train-clean-100.tar.gz -C $storage_dir
+		rm -rf $storage_dir/train-clean-100.tar.gz
+	fi
+}
+
+function LibriSpeech_clean360() {
+	if ! test -e $librispeech_dir/train-clean-360; then
+		echo "Download LibriSpeech/train-clean-360 into $storage_dir"
+		# If downloading stalls for more than 20s, relaunch from previous state.
+		wget -c --tries=0 --read-timeout=20 http://www.openslr.org/resources/12/train-clean-360.tar.gz -P $storage_dir
+		tar -xzf $storage_dir/train-clean-360.tar.gz -C $storage_dir
+		rm -rf $storage_dir/train-clean-360.tar.gz
+	fi
+}
+
+function wham() {
+	if ! test -e $wham_dir; then
+		echo "Download wham_noise into $storage_dir"
+		# If downloading stalls for more than 20s, relaunch from previous state.
+		wget -c --tries=0 --read-timeout=20 https://storage.googleapis.com/whisper-public/wham_noise.zip -P $storage_dir
+		unzip -qn $storage_dir/wham_noise.zip -d $storage_dir
+		rm -rf $storage_dir/wham_noise.zip
+	fi
+}
+
+pids=()
+for job in LibriSpeech_dev_clean LibriSpeech_test_clean LibriSpeech_clean100 LibriSpeech_clean360 wham; do
+	$job &
+	pids+=($!)
+done
+# A bare `wait` returns 0 even if a job failed; wait on each PID so `set -e` aborts when a download/extract fails.
+for pid in "${pids[@]}"; do wait "$pid"; done
diff --git a/generate_librimix.sh b/generate_librimix.sh
old mode 100644
new mode 100755
index d4f77b7..becf9e6
--- a/generate_librimix.sh
+++ b/generate_librimix.sh
@@ -1,68 +1,13 @@
 #!/bin/bash
 set -eu  # Exit on error
 
-storage_dir=$1
+n_src=$1
+storage_dir=$2
 librispeech_dir=$storage_dir/LibriSpeech
 wham_dir=$storage_dir/wham_noise
 librimix_outdir=$storage_dir/
 
-function LibriSpeech_dev_clean() {
-	if ! test -e $librispeech_dir/dev-clean; then
-		echo "Download LibriSpeech/dev-clean into $storage_dir"
-		# If downloading stalls for more than 20s, relaunch from previous state.
-		wget -c --tries=0 --read-timeout=20 http://www.openslr.org/resources/12/dev-clean.tar.gz -P $storage_dir
-		tar -xzf $storage_dir/dev-clean.tar.gz -C $storage_dir
-		rm -rf $storage_dir/dev-clean.tar.gz
-	fi
-}
-
-function LibriSpeech_test_clean() {
-	if ! test -e $librispeech_dir/test-clean; then
-		echo "Download LibriSpeech/test-clean into $storage_dir"
-		# If downloading stalls for more than 20s, relaunch from previous state.
-		wget -c --tries=0 --read-timeout=20 http://www.openslr.org/resources/12/test-clean.tar.gz -P $storage_dir
-		tar -xzf $storage_dir/test-clean.tar.gz -C $storage_dir
-		rm -rf $storage_dir/test-clean.tar.gz
-	fi
-}
-
-function LibriSpeech_clean100() {
-	if ! test -e $librispeech_dir/train-clean-100; then
-		echo "Download LibriSpeech/train-clean-100 into $storage_dir"
-		# If downloading stalls for more than 20s, relaunch from previous state.
-		wget -c --tries=0 --read-timeout=20 http://www.openslr.org/resources/12/train-clean-100.tar.gz -P $storage_dir
-		tar -xzf $storage_dir/train-clean-100.tar.gz -C $storage_dir
-		rm -rf $storage_dir/train-clean-100.tar.gz
-	fi
-}
-
-function LibriSpeech_clean360() {
-	if ! test -e $librispeech_dir/train-clean-360; then
-		echo "Download LibriSpeech/train-clean-360 into $storage_dir"
-		# If downloading stalls for more than 20s, relaunch from previous state.
-		wget -c --tries=0 --read-timeout=20 http://www.openslr.org/resources/12/train-clean-360.tar.gz -P $storage_dir
-		tar -xzf $storage_dir/train-clean-360.tar.gz -C $storage_dir
-		rm -rf $storage_dir/train-clean-360.tar.gz
-	fi
-}
-
-function wham() {
-	if ! test -e $wham_dir; then
-		echo "Download wham_noise into $storage_dir"
-		# If downloading stalls for more than 20s, relaunch from previous state.
-		wget -c --tries=0 --read-timeout=20 https://storage.googleapis.com/whisper-public/wham_noise.zip -P $storage_dir
-		unzip -qn $storage_dir/wham_noise.zip -d $storage_dir
-		rm -rf $storage_dir/wham_noise.zip
-	fi
-}
-
-LibriSpeech_dev_clean &
-LibriSpeech_test_clean &
-LibriSpeech_clean100 &
-LibriSpeech_clean360 &
-wham &
-
-wait
+./download_librispeech_and_wham.sh $storage_dir
 
 # Path to python
 python_path=python
@@ -70,14 +15,13 @@ python_path=python
 # If you wish to rerun this script in the future please comment this line out.
 $python_path scripts/augment_train_noise.py --wham_dir $wham_dir
 
-for n_src in 2 3; do
-  metadata_dir=metadata/Libri$n_src"Mix"
-  $python_path scripts/create_librimix_from_metadata.py --librispeech_dir $librispeech_dir \
-    --wham_dir $wham_dir \
-    --metadata_dir $metadata_dir \
-    --librimix_outdir $librimix_outdir \
-    --n_src $n_src \
-    --freqs 8k 16k \
-    --modes min max \
-    --types mix_clean mix_both mix_single
-done
+
+metadata_dir=metadata/Libri$n_src"Mix"
+$python_path scripts/create_librimix_from_metadata.py --librispeech_dir $librispeech_dir \
+  --wham_dir $wham_dir \
+  --metadata_dir $metadata_dir \
+  --librimix_outdir $librimix_outdir \
+  --n_src $n_src \
+  --freqs 8k \
+  --modes min \
+  --types mix_clean
diff --git a/generate_librimix_metadata.sh b/generate_librimix_metadata.sh
new file mode 100755
index 0000000..033dc16
--- /dev/null
+++ b/generate_librimix_metadata.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+set -eu  # Exit on error
+
+n_src=$1
+storage_dir=$2
+librispeech_dir=$storage_dir/LibriSpeech
+wham_dir=$storage_dir/wham_noise
+metadata_dir=./metadata
+librispeech_md_dir=$metadata_dir/LibriSpeech
+wham_md_dir=$metadata_dir/Wham_noise
+metadata_outdir=$metadata_dir/Libri$n_src"Mix"
+
+./download_librispeech_and_wham.sh $storage_dir
+
+python scripts/create_librimix_metadata.py --librispeech_dir $librispeech_dir \
+  --librispeech_md_dir $librispeech_md_dir \
+  --wham_dir $wham_dir \
+  --wham_md_dir $wham_md_dir \
+  --metadata_outdir $metadata_outdir \
+  --n_src $n_src \
diff --git a/scripts/augment_train_noise.py b/scripts/augment_train_noise.py
index 352c07d..3e7b713 100644
--- a/scripts/augment_train_noise.py
+++ b/scripts/augment_train_noise.py
@@ -54,8 +54,9 @@ def apply_fx(sound_path, speed):
     # Get the effect
     fx = (AudioEffectsChain().speed(speed))
     s, rate = sf.read(sound_path)
-    # Get 1st channel
-    s = s[:, 0]
+    if len(s.shape) > 1:
+        # Get 1st channel
+        s = s[:, 0]
     # Apply effect
     s = fx(s)
     # Write the file
diff --git a/scripts/create_librimix_from_metadata.py b/scripts/create_librimix_from_metadata.py
index e20f19f..4b84e80 100644
--- a/scripts/create_librimix_from_metadata.py
+++ b/scripts/create_librimix_from_metadata.py
@@ -4,6 +4,7 @@
 import pandas as pd
 import numpy as np
 import functools
+import hashlib
 from scipy.signal import resample_poly
 import tqdm.contrib.concurrent
 
@@ -137,12 +138,12 @@ def process_utterances(md_file, librispeech_dir, wham_dir, freq, mode, subdirs,
         [row for _, row in md_file.iterrows()],
         chunksize=10,
     ):
-        for mix_id, snr_list, abs_mix_path, abs_source_path_list, abs_noise_path, length, subdir in results:
+        for mix_id, orig_mix_id, snr_list, abs_mix_path, abs_source_path_list, abs_noise_path, length, subdir in results:
             # Add line to the dataframes
             add_to_metrics_metadata(md_dic[f"metrics_{dir_name}_{subdir}"],
-                                    mix_id, snr_list)
+                                    mix_id, orig_mix_id, snr_list)
             add_to_mixture_metadata(md_dic[f'mixture_{dir_name}_{subdir}'],
-                                    mix_id, abs_mix_path, abs_source_path_list,
+                                    mix_id, orig_mix_id, abs_mix_path, abs_source_path_list,
                                     abs_noise_path, length, subdir)
 
     # Save the metadata files
@@ -157,6 +158,14 @@ def process_utterance(n_src, librispeech_dir, wham_dir, freq, mode, subdirs, dir
     # Get sources and mixture infos
     mix_id, gain_list, sources = read_sources(row, n_src, librispeech_dir,
                                               wham_dir)
+
+    orig_mix_id = mix_id
+    # @ShakedDovrat note: Encode mix_id because it might be too long for a file name (max 255 chars on Linux).
+    # The original mix_id, which is a concatenation of the utterances used in the mixture, is kept in the metadata file under "original_mixture_ID".
+    if n_src >= 10:
+        hash_object = hashlib.md5(mix_id.encode())
+        mix_id = hash_object.hexdigest()
+
     # Transform sources
     transformed_sources = transform_sources(sources, freq, mode, gain_list)
     # Write the sources and get their paths
@@ -186,7 +195,7 @@ def process_utterance(n_src, librispeech_dir, wham_dir, freq, mode, subdirs, dir
         length = len(mixture)
         # Compute SNR
         snr_list = compute_snr_list(mixture, sources_to_mix)
-        res.append((mix_id, snr_list, abs_mix_path, abs_source_path_list, abs_noise_path, length, subdir))
+        res.append((mix_id, orig_mix_id, snr_list, abs_mix_path, abs_source_path_list, abs_noise_path, length, subdir))
 
     return res
 
@@ -195,6 +204,7 @@ def create_empty_metrics_md(n_src, subdir):
     """ Create the metrics dataframe"""
     metrics_dataframe = pd.DataFrame()
     metrics_dataframe['mixture_ID'] = {}
+    metrics_dataframe['original_mixture_ID'] = {}
     if subdir == 'mix_clean':
         for i in range(n_src):
             metrics_dataframe[f"source_{i + 1}_SNR"] = {}
@@ -212,6 +222,7 @@ def create_empty_mixture_md(n_src, subdir):
     """ Create the mixture dataframe"""
     mixture_dataframe = pd.DataFrame()
     mixture_dataframe['mixture_ID'] = {}
+    mixture_dataframe['original_mixture_ID'] = {}
     mixture_dataframe['mixture_path'] = {}
     if subdir == 'mix_clean':
         for i in range(n_src):
@@ -344,6 +355,7 @@ def write_sources(mix_id, transformed_sources, subdirs, dir_path, freq, n_src):
     for src, src_dir in zip(transformed_sources[:n_src], subdirs[:n_src]):
         save_path = os.path.join(dir_path, src_dir, ex_filename)
         abs_save_path = os.path.abspath(save_path)
+        ensure_dir(os.path.dirname(abs_save_path))
         sf.write(abs_save_path, src, freq)
         abs_source_path_list.append(abs_save_path)
     return abs_source_path_list
@@ -355,6 +367,7 @@ def write_noise(mix_id, transformed_sources, dir_path, freq):
     ex_filename = mix_id + '.wav'
     save_path = os.path.join(dir_path, 'noise', ex_filename)
     abs_save_path = os.path.abspath(save_path)
+    ensure_dir(os.path.dirname(abs_save_path))
     sf.write(abs_save_path, noise, freq)
     return abs_save_path
 
@@ -373,6 +386,7 @@ def write_mix(mix_id, mixture, dir_path, subdir, freq):
     ex_filename = mix_id + '.wav'
     save_path = os.path.join(dir_path, subdir, ex_filename)
     abs_save_path = os.path.abspath(save_path)
+    ensure_dir(os.path.dirname(abs_save_path))
     sf.write(abs_save_path, mixture, freq)
     return abs_save_path
 
@@ -391,13 +405,13 @@ def snr_xy(x, y):
     return 10 * np.log10(np.mean(x ** 2) / (np.mean(y ** 2) + EPS) + EPS)
 
 
-def add_to_metrics_metadata(metrics_df, mixture_id, snr_list):
+def add_to_metrics_metadata(metrics_df, mixture_id, original_mixture_id, snr_list):
     """ Add a new line to metrics_df"""
-    row_metrics = [mixture_id] + snr_list
+    row_metrics = [mixture_id, original_mixture_id] + snr_list
     metrics_df.loc[len(metrics_df)] = row_metrics
 
 
-def add_to_mixture_metadata(mix_df, mix_id, abs_mix_path, abs_sources_path,
+def add_to_mixture_metadata(mix_df, mix_id, orig_mix_id, abs_mix_path, abs_sources_path,
                             abs_noise_path, length, subdir):
     """ Add a new line to mixture_df """
     sources_path = abs_sources_path
@@ -406,10 +420,15 @@ def add_to_mixture_metadata(mix_df, mix_id, abs_mix_path, abs_sources_path,
         noise_path = []
     elif subdir == 'mix_single':
         sources_path = [abs_sources_path[0]]
-    row_mixture = [mix_id, abs_mix_path] + sources_path + noise_path + [length]
+    row_mixture = [mix_id, orig_mix_id, abs_mix_path] + sources_path + noise_path + [length]
     mix_df.loc[len(mix_df)] = row_mixture
 
 
+def ensure_dir(d):
+    """Create directory d (and parents) if missing; exist_ok avoids a check-then-create race when workers run concurrently."""
+    os.makedirs(d, exist_ok=True)
+
+
 if __name__ == "__main__":
     args = parser.parse_args()
     main(args)
diff --git a/scripts/create_librimix_metadata.py b/scripts/create_librimix_metadata.py
index 99e4a87..cdb5f09 100644
--- a/scripts/create_librimix_metadata.py
+++ b/scripts/create_librimix_metadata.py
@@ -37,6 +37,15 @@
                     help='Where librimix metadata files will be stored.')
 parser.add_argument('--n_src', type=int, required=True,
                     help='Number of sources desired to create the mixture')
+parser.add_argument('--run_in_parallel', type=lambda s: s.lower() not in ('false', '0', 'no', ''), default=False,  # type=bool would turn any non-empty text, even "False", into True
+                    help='@ShakedDovrat note: Run in parallel to reduce runtime, '
+                         'but add randomness to the process making it non-reproducible.'
+                         'This will create a dataset different from the one used in the paper: '
+                         '"Many-Speakers Single Channel Speech Separation with Optimal Permutation Training"')
+parser.add_argument('--re_use_utterances_for_train', type=lambda s: s.lower() not in ('false', '0', 'no', ''), default=False,  # explicit text-to-bool parsing; type=bool would read "False" as True
+                    help='@ShakedDovrat note: Enlarge training set by re-using training utterances. '
+                         'Especially helpful when n_src is large. This will create a dataset different from the one used in the paper: '
+                         '"Many-Speakers Single Channel Speech Separation with Optimal Permutation Training"')
 
 
 def main(args):
@@ -52,11 +61,11 @@ def main(args):
         md_dir = os.path.join(root, f'LibriMix/metadata')
     os.makedirs(md_dir, exist_ok=True)
     create_librimix_metadata(librispeech_dir, librispeech_md_dir, wham_dir,
-                             wham_md_dir, md_dir, n_src)
+                             wham_md_dir, md_dir, n_src, args.run_in_parallel, args.re_use_utterances_for_train)
 
 
 def create_librimix_metadata(librispeech_dir, librispeech_md_dir, wham_dir,
-                             wham_md_dir, md_dir, n_src):
+                             wham_md_dir, md_dir, n_src, run_in_parallel, re_use_utterances_for_train):
     """ Generate LibriMix metadata according to LibriSpeech metadata """
 
     # Dataset name
@@ -65,9 +74,12 @@ def create_librimix_metadata(librispeech_dir, librispeech_md_dir, wham_dir,
     librispeech_md_files = os.listdir(librispeech_md_dir)
     # List metadata files in wham_noise
     wham_md_files = os.listdir(wham_md_dir)
+
     # If you wish to ignore some metadata files add their name here
     # Example : to_be_ignored = ['dev-other.csv']
     to_be_ignored = []
+    # @ShakedDovrat note: In our paper we didn't use train-100. Use this line to save running time:
+    to_be_ignored = ['train-clean-100.csv']
 
     check_already_generated(md_dir, dataset, to_be_ignored,
                             librispeech_md_files)
@@ -75,6 +87,7 @@ def create_librimix_metadata(librispeech_dir, librispeech_md_dir, wham_dir,
     for librispeech_md_file in librispeech_md_files:
         if not librispeech_md_file.endswith('.csv'):
             print(f"{librispeech_md_file} is not a csv file, continue.")
+            librispeech_md_files.remove(librispeech_md_file)
             continue
         # Get the name of the corresponding noise md file
         try:
@@ -84,32 +97,58 @@ def create_librimix_metadata(librispeech_dir, librispeech_md_dir, wham_dir,
             print('Wham metadata are missing you can either generate the '
                   'missing wham files or add the librispeech metadata to '
                   'to_be_ignored list')
-            break
-
-        # Open .csv files from LibriSpeech
-        librispeech_md = pd.read_csv(os.path.join(
-            librispeech_md_dir, librispeech_md_file), engine='python')
-        # Open .csv files from wham_noise
-        wham_md = pd.read_csv(os.path.join(
-            wham_md_dir, wham_md_file), engine='python')
-        # Filenames
-        save_path = os.path.join(md_dir,
-                                 '_'.join([dataset, librispeech_md_file]))
-        info_name = '_'.join([dataset, librispeech_md_file.strip('.csv'),
-                              'info']) + '.csv'
-        info_save_path = os.path.join(md_dir, info_name)
-        print(f"Creating {os.path.basename(save_path)} file in {md_dir}")
-        # Create dataframe
-        mixtures_md, mixtures_info = create_librimix_df(
-            librispeech_md, librispeech_dir, wham_md, wham_dir,
-            n_src)
-        # Round number of files
-        mixtures_md = mixtures_md[:len(mixtures_md) // 100 * 100]
-        mixtures_info = mixtures_info[:len(mixtures_info) // 100 * 100]
-
-        # Save csv files
-        mixtures_md.to_csv(save_path, index=False)
-        mixtures_info.to_csv(info_save_path, index=False)
+            return
+
+    if run_in_parallel:
+        import warnings
+        import multiprocessing
+        warnings.warn("Running in parallel reduces running time, but might create a non-reproducible samples selection. Use with caution.")
+        jobs = []
+        for librispeech_md_file in librispeech_md_files:
+            p = multiprocessing.Process(target=create_librimix_metadata_single_set,
+                                        args=(librispeech_dir, librispeech_md_dir, wham_dir, wham_md_dir, md_dir, n_src,
+                                              librispeech_md_file, re_use_utterances_for_train))
+            jobs.append(p)
+            p.start()
+        [job.join() for job in jobs]  # wait for all to finish
+    else:
+        for librispeech_md_file in librispeech_md_files:
+            create_librimix_metadata_single_set(librispeech_dir, librispeech_md_dir, wham_dir, wham_md_dir, md_dir, n_src,
+                                                librispeech_md_file, re_use_utterances_for_train)
+
+
+def create_librimix_metadata_single_set(librispeech_dir, librispeech_md_dir, wham_dir, wham_md_dir, md_dir, n_src,
+                                        librispeech_md_file, re_use_utterances_for_train):
+    dataset = f'libri{n_src}mix'
+    wham_md_files = os.listdir(wham_md_dir)
+
+    wham_md_file = [f for f in wham_md_files if
+                    f.startswith(librispeech_md_file.split('-')[0])][0]
+
+    # Open .csv files from LibriSpeech
+    librispeech_md = pd.read_csv(os.path.join(
+        librispeech_md_dir, librispeech_md_file), engine='python')
+    # Open .csv files from wham_noise
+    wham_md = pd.read_csv(os.path.join(
+        wham_md_dir, wham_md_file), engine='python')
+    # Filenames
+    save_path = os.path.join(md_dir,
+                             '_'.join([dataset, librispeech_md_file]))
+    info_name = '_'.join([dataset, librispeech_md_file.strip('.csv'),
+                          'info']) + '.csv'
+    info_save_path = os.path.join(md_dir, info_name)
+    print(f"Creating {os.path.basename(save_path)} file in {md_dir}")
+    # Create dataframe
+    mixtures_md, mixtures_info = create_librimix_df(
+        librispeech_md, librispeech_dir, wham_md, wham_dir,
+        n_src, re_use_utterances_for_train)
+    # Round number of files
+    mixtures_md = mixtures_md[:len(mixtures_md) // 100 * 100]
+    mixtures_info = mixtures_info[:len(mixtures_info) // 100 * 100]
+
+    # Save csv files
+    mixtures_md.to_csv(save_path, index=False)
+    mixtures_info.to_csv(info_save_path, index=False)
 
 
 def check_already_generated(md_dir, dataset, to_be_ignored,
@@ -133,7 +172,7 @@ def check_already_generated(md_dir, dataset, to_be_ignored,
 
 
 def create_librimix_df(librispeech_md_file, librispeech_dir,
-                       wham_md_file, wham_dir, n_src):
+                       wham_md_file, wham_dir, n_src, re_use_utterances_for_train):
     """ Generate librimix dataframe from a LibriSpeech and wha md file"""
 
     # Create a dataframe that will be used to generate sources and mixtures
@@ -149,7 +188,7 @@ def create_librimix_df(librispeech_md_file, librispeech_dir,
     mixtures_md["noise_path"] = {}
     mixtures_md["noise_gain"] = {}
     # Generate pairs of sources to mix
-    pairs, pairs_noise = set_pairs(librispeech_md_file, wham_md_file, n_src)
+    pairs, pairs_noise = set_pairs(librispeech_md_file, wham_md_file, n_src, re_use_utterances_for_train)
     clip_counter = 0
     # For each combination create a new line in the dataframe
     for pair, pair_noise in tqdm(zip(pairs, pairs_noise), total=len(pairs)):
@@ -178,83 +217,98 @@ def create_librimix_df(librispeech_md_file, librispeech_dir,
     return mixtures_md, mixtures_info
 
 
-def set_pairs(librispeech_md_file, wham_md_file, n_src):
-    """ set pairs of sources to make the mixture """
+def set_pairs(librispeech_md_file, wham_md_file, n_src, re_use_utterances_for_train):
+    """ set "pairs" of sources to make the mixture
+    This function uses 'pair' and 'couple' semantics, but is used for any n_src, not just two."""
     # Initialize list for pairs sources
+
     utt_pairs = []
     noise_pairs = []
     # In train sets utterance are only used once
-    if 'train' in librispeech_md_file.iloc[0]['subset']:
+    is_train = 'train' in librispeech_md_file.iloc[0]['subset']
+    if is_train and not re_use_utterances_for_train:
         utt_pairs = set_utt_pairs(librispeech_md_file, utt_pairs, n_src)
         noise_pairs = set_noise_pairs(utt_pairs, noise_pairs,
-                                      librispeech_md_file, wham_md_file)
-    # Otherwise we want 3000 mixtures
+                                      librispeech_md_file, wham_md_file, len(utt_pairs))
+    # Otherwise we want 3000 or 1000 mixtures
     else:
-        while len(utt_pairs) < 3000:
-            utt_pairs = set_utt_pairs(librispeech_md_file, utt_pairs, n_src)
-            noise_pairs = set_noise_pairs(utt_pairs, noise_pairs,
-                                          librispeech_md_file, wham_md_file)
+        # @ShakedDovrat note: Decrease #samples to 1000 for n_src >= 10, a convention used in our paper, that started in
+        # "Towards listening to 10 people simultaneously: An efficient permutation invariant training of audio source separation using sinkhorn’s algorithm".
+        # This was done to decrease running time of this data creation process, but can be discarded in future research after run time improvements
+        # I made to the process. We kept it to be comparable to the Sinkhorn paper.
+        target_num_samples = 3000 if n_src < 10 else 1000
+        if is_train:
+            target_num_samples = 20000
+        while len(utt_pairs) < target_num_samples:
+            # @ShakedDovrat note: A bug was fixed here, where all utt were added instead of just the new ones of each iteration.
+            new_utt_pairs = set_utt_pairs(librispeech_md_file, [], n_src)
+            new_noise_pairs = set_noise_pairs(new_utt_pairs, [],
+                                              librispeech_md_file, wham_md_file, len(utt_pairs) + len(new_utt_pairs))
+            utt_pairs += new_utt_pairs
+            noise_pairs += new_noise_pairs
             utt_pairs, noise_pairs = remove_duplicates(utt_pairs, noise_pairs)
-        utt_pairs = utt_pairs[:3000]
-        noise_pairs = noise_pairs[:3000]
+        utt_pairs = utt_pairs[:target_num_samples]
+        noise_pairs = noise_pairs[:target_num_samples]
 
     return utt_pairs, noise_pairs
 
 
 def set_utt_pairs(librispeech_md_file, pair_list, n_src):
-    # A counter
-    c = 0
+    # @ShakedDovrat note: This function was modified to reduce runtime.
+    # This function uses 'pair' and 'couple' semantics, but is used for any n_src, not just two.
+    # Appends to pair_list lists of n_src row indices with distinct speaker_IDs (each row is drawn at most once) and returns it.
     # Index of the rows in the metadata file
-    index = list(range(len(librispeech_md_file)))
-
-    # Try to create pairs with different speakers end after 200 fails
-    while len(index) >= n_src and c < 200:
-        couple = random.sample(index, n_src)
-        # Check that speakers are different
-        speaker_list = set([librispeech_md_file.iloc[couple[i]]['speaker_ID']
-                            for i in range(n_src)])
-        # If there are duplicates then increment the counter
-        if len(speaker_list) != n_src:
-            c += 1
-        # Else append the combination to pair_list and erase the combination
-        # from the available indexes
-        else:
-            for i in range(n_src):
-                index.remove(couple[i])
+    index = set(range(len(librispeech_md_file)))
+
+    while len(index) >= n_src:  # While we still have rows to use
+        num_failed_trials = 0
+        couple = []
+        speaker_list = set()  # Speakers already in this mixture; kept across trials so a later draw cannot repeat one
+        # Try to complete the group with different speakers; give up after 200 draws that add nothing
+        while len(couple) < n_src and num_failed_trials < 200:
+            samples = random.sample(sorted(index), min(len(index), 10 * n_src))  # Heuristic - draw 10*n_src samples to hopefully find n_src unique speakers; sorted() because random.sample() rejects sets on Python >= 3.11.
+            found = False
+            for sample in samples:
+                speaker_id = librispeech_md_file.iloc[sample]['speaker_ID']
+                if speaker_id not in speaker_list:  # We only add speakers not yet used in this mixture
+                    speaker_list.add(speaker_id)
+                    couple.append(sample)
+                    index.remove(sample)
+                    found = True
+                    if len(couple) == n_src:
+                        break
+            if not found:
+                num_failed_trials += 1
+        if len(couple) == n_src:
             pair_list.append(couple)
-            c = 0
+
     return pair_list
 
 
-def set_noise_pairs(pairs, noise_pairs, librispeech_md_file, wham_md_file):
+def set_noise_pairs(pairs, noise_pairs, librispeech_md_file, wham_md_file, total_num_of_pairs):
     print('Generating pairs')
+    is_train = 'train' in librispeech_md_file.iloc[0]['subset']
     # Initially take not augmented data
     md = wham_md_file[wham_md_file['augmented'] == False]
     # If there are more mixtures than noises then use augmented data
-    if len(pairs) > len(md):
+    if total_num_of_pairs > len(md):
         md = wham_md_file
     # Copy pairs because we are going to remove elements from pairs
     for pair in pairs.copy():
-        # get sources infos
-        sources = [librispeech_md_file.iloc[pair[i]]
-                   for i in range(len(pair))]
         # get max_length
-        length_list = [source['length'] for source in sources]
-        max_length = max(length_list)
+        max_length = max(librispeech_md_file.iloc[elem]['length'] for elem in pair)
         # Ideal choices are noises longer than max_length
         possible = md[md['length'] >= max_length]
-        # if possible is not empty
-        try:
+        if not possible.empty:
             # random noise longer than max_length
             pair_noise = random.sample(list(possible.index), 1)
             # add that noise's index to the list
             noise_pairs.append(pair_noise)
             # remove that noise from the remaining noises
             md = md.drop(pair_noise)
-        # if possible is empty
-        except ValueError:
+        else:
             # if we deal with training files
-            if 'train' in librispeech_md_file.iloc[0]['subset']:
+            if is_train:
                 # take the longest noise remaining
                 pair_noise = list(md.index)[-1]
                 # add it to noise list
@@ -270,15 +324,19 @@ def set_noise_pairs(pairs, noise_pairs, librispeech_md_file, wham_md_file):
 
 
 def remove_duplicates(utt_pairs, noise_pairs):
+    # @ShakedDovrat note: This function was modified to reduce runtime.
     print('Removing duplicates')
-    # look for identical mixtures O(n²)
-    for i, (pair, pair_noise) in enumerate(zip(utt_pairs, noise_pairs)):
-        for j, (du_pair, du_pair_noise) in enumerate(
-                zip(utt_pairs, noise_pairs)):
-            # sort because [s1,s2] = [s2,s1]
-            if sorted(pair) == sorted(du_pair) and i != j:
-                utt_pairs.remove(du_pair)
-                noise_pairs.remove(du_pair_noise)
+    seen = set()  # Sorted utterance tuples already encountered: single pass, the first occurrence of each mixture is kept
+    indices_to_remove = set()
+    for i, pair in enumerate(utt_pairs):
+        key = tuple(sorted(pair))  # sort because [s1,s2] = [s2,s1]
+        if key in seen:
+            indices_to_remove.add(i)
+        seen.add(key)
+    if len(indices_to_remove) > 0:
+        utt_pairs = [item for i, item in enumerate(utt_pairs) if i not in indices_to_remove]
+        noise_pairs = [item for i, item in enumerate(noise_pairs) if i not in indices_to_remove]
+
     return utt_pairs, noise_pairs