diff --git a/README.md b/README.md index f32d895..6e98737 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,50 @@ +### LibriMix version for many speakers +This is a version of the LibriMix dataset repository that is meant to be used when needing mixtures of many speakers. + +This version of the dataset was used in the paper: + +[Many-Speakers Single Channel Speech Separation with Optimal Permutation Training](https://arxiv.org/abs/2104.08955) + +This is not an implementation of the method in the paper. + +### About this version of the dataset +This version is a modified version of the original LibriMix dataset. +It is used mainly to create LibriMix datasets with many speakers (more than 5), but can be used for any number of speakers. +**The original LibriMix dataset had issues making the data creation too slow or empirically stuck indefinitely for 10-15 speakers or more.** + +This version was used in our paper: + +``` +S. Dovrat, E. Nachmani, L. Wolf. Many-Speakers Single Channel Speech Separation with Optimal Permutation Training. Annual Conference of the International Speech Communication Association (INTERSPEECH), 2021. +``` + +To recreate the datasets used there, create your LibriMix datasets using this version. + +Visit this +[GitHub comparison between the two versions](https://github.com/JorisCos/LibriMix/compare/master...ShakedDovrat:master) +to see the exact changes we made in this version. + +#### Instructions +To create datasets with 2 or 3 speakers, skip this stage. + +To create datasets with more than 3 speakers, you need to create the metadata first. + +Run +[`generate_librimix_metadata.sh`](./generate_librimix_metadata.sh) +and then use the same instructions from the original LibriMix, stated below. + +This script by the original LibriMix requires you to download the Wham! noise dataset in addition to the LibriSpeech dataset, +even though it is not used in our paper (we only use clean mixtures, without noise). 
+ +##### Altered generate_librimix.sh +We altered +[`generate_librimix.sh`](./generate_librimix.sh) +a bit so it will only create the type of data we used in our paper: +8 Khz sampling, Min mode and clean mixtures. +We also added `n_src` as an argument to the script. + +-------------------------------------------- + ### About the dataset LibriMix is an open source dataset for source separation in noisy environments. It is derived from LibriSpeech signals (clean subset) @@ -11,7 +58,7 @@ To generate LibriMix, clone the repo and run the main script : ``` git clone https://github.com/JorisCos/LibriMix cd LibriMix -./generate_librimix.sh storage_dir +./generate_librimix.sh n_src storage_dir ``` Make sure that SoX is installed on your machine. diff --git a/download_librispeech_and_wham.sh b/download_librispeech_and_wham.sh new file mode 100755 index 0000000..77fb913 --- /dev/null +++ b/download_librispeech_and_wham.sh @@ -0,0 +1,65 @@ +#!/bin/bash +set -eu # Exit on error + +storage_dir=$1 +librispeech_dir=$storage_dir/LibriSpeech +wham_dir=$storage_dir/wham_noise +librimix_outdir=$storage_dir/ + +function LibriSpeech_dev_clean() { + if ! test -e $librispeech_dir/dev-clean; then + echo "Download LibriSpeech/dev-clean into $storage_dir" + # If downloading stalls for more than 20s, relaunch from previous state. + wget -c --tries=0 --read-timeout=20 http://www.openslr.org/resources/12/dev-clean.tar.gz -P $storage_dir + tar -xzf $storage_dir/dev-clean.tar.gz -C $storage_dir + rm -rf $storage_dir/dev-clean.tar.gz + fi +} + +function LibriSpeech_test_clean() { + if ! test -e $librispeech_dir/test-clean; then + echo "Download LibriSpeech/test-clean into $storage_dir" + # If downloading stalls for more than 20s, relaunch from previous state. 
+ wget -c --tries=0 --read-timeout=20 http://www.openslr.org/resources/12/test-clean.tar.gz -P $storage_dir + tar -xzf $storage_dir/test-clean.tar.gz -C $storage_dir + rm -rf $storage_dir/test-clean.tar.gz + fi +} + +function LibriSpeech_clean100() { + if ! test -e $librispeech_dir/train-clean-100; then + echo "Download LibriSpeech/train-clean-100 into $storage_dir" + # If downloading stalls for more than 20s, relaunch from previous state. + wget -c --tries=0 --read-timeout=20 http://www.openslr.org/resources/12/train-clean-100.tar.gz -P $storage_dir + tar -xzf $storage_dir/train-clean-100.tar.gz -C $storage_dir + rm -rf $storage_dir/train-clean-100.tar.gz + fi +} + +function LibriSpeech_clean360() { + if ! test -e $librispeech_dir/train-clean-360; then + echo "Download LibriSpeech/train-clean-360 into $storage_dir" + # If downloading stalls for more than 20s, relaunch from previous state. + wget -c --tries=0 --read-timeout=20 http://www.openslr.org/resources/12/train-clean-360.tar.gz -P $storage_dir + tar -xzf $storage_dir/train-clean-360.tar.gz -C $storage_dir + rm -rf $storage_dir/train-clean-360.tar.gz + fi +} + +function wham() { + if ! test -e $wham_dir; then + echo "Download wham_noise into $storage_dir" + # If downloading stalls for more than 20s, relaunch from previous state. 
+ wget -c --tries=0 --read-timeout=20 https://storage.googleapis.com/whisper-public/wham_noise.zip -P $storage_dir + unzip -qn $storage_dir/wham_noise.zip -d $storage_dir + rm -rf $storage_dir/wham_noise.zip + fi +} + +LibriSpeech_dev_clean & +LibriSpeech_test_clean & +LibriSpeech_clean100 & +LibriSpeech_clean360 & +wham & + +wait diff --git a/generate_librimix.sh b/generate_librimix.sh old mode 100644 new mode 100755 index d4f77b7..becf9e6 --- a/generate_librimix.sh +++ b/generate_librimix.sh @@ -1,68 +1,13 @@ #!/bin/bash set -eu # Exit on error -storage_dir=$1 +n_src=$1 +storage_dir=$2 librispeech_dir=$storage_dir/LibriSpeech wham_dir=$storage_dir/wham_noise librimix_outdir=$storage_dir/ -function LibriSpeech_dev_clean() { - if ! test -e $librispeech_dir/dev-clean; then - echo "Download LibriSpeech/dev-clean into $storage_dir" - # If downloading stalls for more than 20s, relaunch from previous state. - wget -c --tries=0 --read-timeout=20 http://www.openslr.org/resources/12/dev-clean.tar.gz -P $storage_dir - tar -xzf $storage_dir/dev-clean.tar.gz -C $storage_dir - rm -rf $storage_dir/dev-clean.tar.gz - fi -} - -function LibriSpeech_test_clean() { - if ! test -e $librispeech_dir/test-clean; then - echo "Download LibriSpeech/test-clean into $storage_dir" - # If downloading stalls for more than 20s, relaunch from previous state. - wget -c --tries=0 --read-timeout=20 http://www.openslr.org/resources/12/test-clean.tar.gz -P $storage_dir - tar -xzf $storage_dir/test-clean.tar.gz -C $storage_dir - rm -rf $storage_dir/test-clean.tar.gz - fi -} - -function LibriSpeech_clean100() { - if ! test -e $librispeech_dir/train-clean-100; then - echo "Download LibriSpeech/train-clean-100 into $storage_dir" - # If downloading stalls for more than 20s, relaunch from previous state. 
- wget -c --tries=0 --read-timeout=20 http://www.openslr.org/resources/12/train-clean-100.tar.gz -P $storage_dir - tar -xzf $storage_dir/train-clean-100.tar.gz -C $storage_dir - rm -rf $storage_dir/train-clean-100.tar.gz - fi -} - -function LibriSpeech_clean360() { - if ! test -e $librispeech_dir/train-clean-360; then - echo "Download LibriSpeech/train-clean-360 into $storage_dir" - # If downloading stalls for more than 20s, relaunch from previous state. - wget -c --tries=0 --read-timeout=20 http://www.openslr.org/resources/12/train-clean-360.tar.gz -P $storage_dir - tar -xzf $storage_dir/train-clean-360.tar.gz -C $storage_dir - rm -rf $storage_dir/train-clean-360.tar.gz - fi -} - -function wham() { - if ! test -e $wham_dir; then - echo "Download wham_noise into $storage_dir" - # If downloading stalls for more than 20s, relaunch from previous state. - wget -c --tries=0 --read-timeout=20 https://storage.googleapis.com/whisper-public/wham_noise.zip -P $storage_dir - unzip -qn $storage_dir/wham_noise.zip -d $storage_dir - rm -rf $storage_dir/wham_noise.zip - fi -} - -LibriSpeech_dev_clean & -LibriSpeech_test_clean & -LibriSpeech_clean100 & -LibriSpeech_clean360 & -wham & - -wait +./download_librispeech_and_wham.sh $storage_dir # Path to python python_path=python @@ -70,14 +15,13 @@ python_path=python # If you wish to rerun this script in the future please comment this line out. 
$python_path scripts/augment_train_noise.py --wham_dir $wham_dir -for n_src in 2 3; do - metadata_dir=metadata/Libri$n_src"Mix" - $python_path scripts/create_librimix_from_metadata.py --librispeech_dir $librispeech_dir \ - --wham_dir $wham_dir \ - --metadata_dir $metadata_dir \ - --librimix_outdir $librimix_outdir \ - --n_src $n_src \ - --freqs 8k 16k \ - --modes min max \ - --types mix_clean mix_both mix_single -done + +metadata_dir=metadata/Libri$n_src"Mix" +$python_path scripts/create_librimix_from_metadata.py --librispeech_dir $librispeech_dir \ + --wham_dir $wham_dir \ + --metadata_dir $metadata_dir \ + --librimix_outdir $librimix_outdir \ + --n_src $n_src \ + --freqs 8k \ + --modes min \ + --types mix_clean diff --git a/generate_librimix_metadata.sh b/generate_librimix_metadata.sh new file mode 100755 index 0000000..033dc16 --- /dev/null +++ b/generate_librimix_metadata.sh @@ -0,0 +1,20 @@ +#!/bin/bash +set -eu # Exit on error + +n_src=$1 +storage_dir=$2 +librispeech_dir=$storage_dir/LibriSpeech +wham_dir=$storage_dir/wham_noise +metadata_dir=./metadata +librispeech_md_dir=$metadata_dir/LibriSpeech +wham_md_dir=$metadata_dir/Wham_noise +metadata_outdir=$metadata_dir/Libri$n_src"Mix" + +./download_librispeech_and_wham.sh $storage_dir + +python scripts/create_librimix_metadata.py --librispeech_dir $librispeech_dir \ + --librispeech_md_dir $librispeech_md_dir \ + --wham_dir $wham_dir \ + --wham_md_dir $wham_md_dir \ + --metadata_outdir $metadata_outdir \ + --n_src $n_src \ diff --git a/scripts/augment_train_noise.py b/scripts/augment_train_noise.py index 352c07d..3e7b713 100644 --- a/scripts/augment_train_noise.py +++ b/scripts/augment_train_noise.py @@ -54,8 +54,9 @@ def apply_fx(sound_path, speed): # Get the effect fx = (AudioEffectsChain().speed(speed)) s, rate = sf.read(sound_path) - # Get 1st channel - s = s[:, 0] + if len(s.shape) > 1: + # Get 1st channel + s = s[:, 0] # Apply effect s = fx(s) # Write the file diff --git 
a/scripts/create_librimix_from_metadata.py b/scripts/create_librimix_from_metadata.py index e20f19f..4b84e80 100644 --- a/scripts/create_librimix_from_metadata.py +++ b/scripts/create_librimix_from_metadata.py @@ -4,6 +4,7 @@ import pandas as pd import numpy as np import functools +import hashlib from scipy.signal import resample_poly import tqdm.contrib.concurrent @@ -137,12 +138,12 @@ def process_utterances(md_file, librispeech_dir, wham_dir, freq, mode, subdirs, [row for _, row in md_file.iterrows()], chunksize=10, ): - for mix_id, snr_list, abs_mix_path, abs_source_path_list, abs_noise_path, length, subdir in results: + for mix_id, orig_mix_id, snr_list, abs_mix_path, abs_source_path_list, abs_noise_path, length, subdir in results: # Add line to the dataframes add_to_metrics_metadata(md_dic[f"metrics_{dir_name}_{subdir}"], - mix_id, snr_list) + mix_id, orig_mix_id, snr_list) add_to_mixture_metadata(md_dic[f'mixture_{dir_name}_{subdir}'], - mix_id, abs_mix_path, abs_source_path_list, + mix_id, orig_mix_id, abs_mix_path, abs_source_path_list, abs_noise_path, length, subdir) # Save the metadata files @@ -157,6 +158,14 @@ def process_utterance(n_src, librispeech_dir, wham_dir, freq, mode, subdirs, dir # Get sources and mixture infos mix_id, gain_list, sources = read_sources(row, n_src, librispeech_dir, wham_dir) + + orig_mix_id = mix_id + # @ShakedDovrat note: Encode mix_id because it might be too long for a file name (max 255 chars on Linux). + # The original mix_id, which is a concatenation of the utterances used in the mixture, is kept in the metadata file under "original_mixture_ID". 
+ if n_src >= 10: + hash_object = hashlib.md5(mix_id.encode()) + mix_id = hash_object.hexdigest() + # Transform sources transformed_sources = transform_sources(sources, freq, mode, gain_list) # Write the sources and get their paths @@ -186,7 +195,7 @@ def process_utterance(n_src, librispeech_dir, wham_dir, freq, mode, subdirs, dir length = len(mixture) # Compute SNR snr_list = compute_snr_list(mixture, sources_to_mix) - res.append((mix_id, snr_list, abs_mix_path, abs_source_path_list, abs_noise_path, length, subdir)) + res.append((mix_id, orig_mix_id, snr_list, abs_mix_path, abs_source_path_list, abs_noise_path, length, subdir)) return res @@ -195,6 +204,7 @@ def create_empty_metrics_md(n_src, subdir): """ Create the metrics dataframe""" metrics_dataframe = pd.DataFrame() metrics_dataframe['mixture_ID'] = {} + metrics_dataframe['original_mixture_ID'] = {} if subdir == 'mix_clean': for i in range(n_src): metrics_dataframe[f"source_{i + 1}_SNR"] = {} @@ -212,6 +222,7 @@ def create_empty_mixture_md(n_src, subdir): """ Create the mixture dataframe""" mixture_dataframe = pd.DataFrame() mixture_dataframe['mixture_ID'] = {} + mixture_dataframe['original_mixture_ID'] = {} mixture_dataframe['mixture_path'] = {} if subdir == 'mix_clean': for i in range(n_src): @@ -344,6 +355,7 @@ def write_sources(mix_id, transformed_sources, subdirs, dir_path, freq, n_src): for src, src_dir in zip(transformed_sources[:n_src], subdirs[:n_src]): save_path = os.path.join(dir_path, src_dir, ex_filename) abs_save_path = os.path.abspath(save_path) + ensure_dir(os.path.dirname(abs_save_path)) sf.write(abs_save_path, src, freq) abs_source_path_list.append(abs_save_path) return abs_source_path_list @@ -355,6 +367,7 @@ def write_noise(mix_id, transformed_sources, dir_path, freq): ex_filename = mix_id + '.wav' save_path = os.path.join(dir_path, 'noise', ex_filename) abs_save_path = os.path.abspath(save_path) + ensure_dir(os.path.dirname(abs_save_path)) sf.write(abs_save_path, noise, freq) return 
abs_save_path @@ -373,6 +386,7 @@ def write_mix(mix_id, mixture, dir_path, subdir, freq): ex_filename = mix_id + '.wav' save_path = os.path.join(dir_path, subdir, ex_filename) abs_save_path = os.path.abspath(save_path) + ensure_dir(os.path.dirname(abs_save_path)) sf.write(abs_save_path, mixture, freq) return abs_save_path @@ -391,13 +405,13 @@ def snr_xy(x, y): return 10 * np.log10(np.mean(x ** 2) / (np.mean(y ** 2) + EPS) + EPS) -def add_to_metrics_metadata(metrics_df, mixture_id, snr_list): +def add_to_metrics_metadata(metrics_df, mixture_id, original_mixture_id, snr_list): """ Add a new line to metrics_df""" - row_metrics = [mixture_id] + snr_list + row_metrics = [mixture_id, original_mixture_id] + snr_list metrics_df.loc[len(metrics_df)] = row_metrics -def add_to_mixture_metadata(mix_df, mix_id, abs_mix_path, abs_sources_path, +def add_to_mixture_metadata(mix_df, mix_id, orig_mix_id, abs_mix_path, abs_sources_path, abs_noise_path, length, subdir): """ Add a new line to mixture_df """ sources_path = abs_sources_path @@ -406,10 +420,15 @@ def add_to_mixture_metadata(mix_df, mix_id, abs_mix_path, abs_sources_path, noise_path = [] elif subdir == 'mix_single': sources_path = [abs_sources_path[0]] - row_mixture = [mix_id, abs_mix_path] + sources_path + noise_path + [length] + row_mixture = [mix_id, orig_mix_id, abs_mix_path] + sources_path + noise_path + [length] mix_df.loc[len(mix_df)] = row_mixture +def ensure_dir(d): + if not os.path.exists(d): + os.makedirs(d) + + if __name__ == "__main__": args = parser.parse_args() main(args) diff --git a/scripts/create_librimix_metadata.py b/scripts/create_librimix_metadata.py index 99e4a87..cdb5f09 100644 --- a/scripts/create_librimix_metadata.py +++ b/scripts/create_librimix_metadata.py @@ -37,6 +37,15 @@ help='Where librimix metadata files will be stored.') parser.add_argument('--n_src', type=int, required=True, help='Number of sources desired to create the mixture') +parser.add_argument('--run_in_parallel', type=bool, 
default=False, + help='@ShakedDovrat note: Run in parallel to reduce runtime, ' + 'but add randomness to the process making it non-reproducible.' + 'This will create a dataset different from the one used in the paper: ' + '"Many-Speakers Single Channel Speech Separation with Optimal Permutation Training"') +parser.add_argument('--re_use_utterances_for_train', type=bool, default=False, + help='@ShakedDovrat note: Enlarge training set by re-using training utterances. ' + 'Especially helpful when n_src is large. This will create a dataset different from the one used in the paper: ' + '"Many-Speakers Single Channel Speech Separation with Optimal Permutation Training"') def main(args): @@ -52,11 +61,11 @@ def main(args): md_dir = os.path.join(root, f'LibriMix/metadata') os.makedirs(md_dir, exist_ok=True) create_librimix_metadata(librispeech_dir, librispeech_md_dir, wham_dir, - wham_md_dir, md_dir, n_src) + wham_md_dir, md_dir, n_src, args.run_in_parallel, args.re_use_utterances_for_train) def create_librimix_metadata(librispeech_dir, librispeech_md_dir, wham_dir, - wham_md_dir, md_dir, n_src): + wham_md_dir, md_dir, n_src, run_in_parallel, re_use_utterances_for_train): """ Generate LibriMix metadata according to LibriSpeech metadata """ # Dataset name @@ -65,9 +74,12 @@ def create_librimix_metadata(librispeech_dir, librispeech_md_dir, wham_dir, librispeech_md_files = os.listdir(librispeech_md_dir) # List metadata files in wham_noise wham_md_files = os.listdir(wham_md_dir) + # If you wish to ignore some metadata files add their name here # Example : to_be_ignored = ['dev-other.csv'] to_be_ignored = [] + # @ShakedDovrat note: In our paper we didn't use train-100. 
Use this line to save running time: + to_be_ignored = ['train-clean-100.csv'] check_already_generated(md_dir, dataset, to_be_ignored, librispeech_md_files) @@ -75,6 +87,7 @@ def create_librimix_metadata(librispeech_dir, librispeech_md_dir, wham_dir, for librispeech_md_file in librispeech_md_files: if not librispeech_md_file.endswith('.csv'): print(f"{librispeech_md_file} is not a csv file, continue.") + librispeech_md_files.remove(librispeech_md_file) continue # Get the name of the corresponding noise md file try: @@ -84,32 +97,58 @@ def create_librimix_metadata(librispeech_dir, librispeech_md_dir, wham_dir, print('Wham metadata are missing you can either generate the ' 'missing wham files or add the librispeech metadata to ' 'to_be_ignored list') - break - - # Open .csv files from LibriSpeech - librispeech_md = pd.read_csv(os.path.join( - librispeech_md_dir, librispeech_md_file), engine='python') - # Open .csv files from wham_noise - wham_md = pd.read_csv(os.path.join( - wham_md_dir, wham_md_file), engine='python') - # Filenames - save_path = os.path.join(md_dir, - '_'.join([dataset, librispeech_md_file])) - info_name = '_'.join([dataset, librispeech_md_file.strip('.csv'), - 'info']) + '.csv' - info_save_path = os.path.join(md_dir, info_name) - print(f"Creating {os.path.basename(save_path)} file in {md_dir}") - # Create dataframe - mixtures_md, mixtures_info = create_librimix_df( - librispeech_md, librispeech_dir, wham_md, wham_dir, - n_src) - # Round number of files - mixtures_md = mixtures_md[:len(mixtures_md) // 100 * 100] - mixtures_info = mixtures_info[:len(mixtures_info) // 100 * 100] - - # Save csv files - mixtures_md.to_csv(save_path, index=False) - mixtures_info.to_csv(info_save_path, index=False) + return + + if run_in_parallel: + import warnings + import multiprocessing + warnings.warn("Running in parallel reduces running time, but might create a non-reproducible samples selection. 
Use with caution.") + jobs = [] + for librispeech_md_file in librispeech_md_files: + p = multiprocessing.Process(target=create_librimix_metadata_single_set, + args=(librispeech_dir, librispeech_md_dir, wham_dir, wham_md_dir, md_dir, n_src, + librispeech_md_file, re_use_utterances_for_train)) + jobs.append(p) + p.start() + [job.join() for job in jobs] # wait for all to finish + else: + for librispeech_md_file in librispeech_md_files: + create_librimix_metadata_single_set(librispeech_dir, librispeech_md_dir, wham_dir, wham_md_dir, md_dir, n_src, + librispeech_md_file, re_use_utterances_for_train) + + +def create_librimix_metadata_single_set(librispeech_dir, librispeech_md_dir, wham_dir, wham_md_dir, md_dir, n_src, + librispeech_md_file, re_use_utterances_for_train): + dataset = f'libri{n_src}mix' + wham_md_files = os.listdir(wham_md_dir) + + wham_md_file = [f for f in wham_md_files if + f.startswith(librispeech_md_file.split('-')[0])][0] + + # Open .csv files from LibriSpeech + librispeech_md = pd.read_csv(os.path.join( + librispeech_md_dir, librispeech_md_file), engine='python') + # Open .csv files from wham_noise + wham_md = pd.read_csv(os.path.join( + wham_md_dir, wham_md_file), engine='python') + # Filenames + save_path = os.path.join(md_dir, + '_'.join([dataset, librispeech_md_file])) + info_name = '_'.join([dataset, librispeech_md_file.strip('.csv'), + 'info']) + '.csv' + info_save_path = os.path.join(md_dir, info_name) + print(f"Creating {os.path.basename(save_path)} file in {md_dir}") + # Create dataframe + mixtures_md, mixtures_info = create_librimix_df( + librispeech_md, librispeech_dir, wham_md, wham_dir, + n_src, re_use_utterances_for_train) + # Round number of files + mixtures_md = mixtures_md[:len(mixtures_md) // 100 * 100] + mixtures_info = mixtures_info[:len(mixtures_info) // 100 * 100] + + # Save csv files + mixtures_md.to_csv(save_path, index=False) + mixtures_info.to_csv(info_save_path, index=False) def check_already_generated(md_dir, dataset, 
to_be_ignored, @@ -133,7 +172,7 @@ def check_already_generated(md_dir, dataset, to_be_ignored, def create_librimix_df(librispeech_md_file, librispeech_dir, - wham_md_file, wham_dir, n_src): + wham_md_file, wham_dir, n_src, re_use_utterances_for_train): """ Generate librimix dataframe from a LibriSpeech and wha md file""" # Create a dataframe that will be used to generate sources and mixtures @@ -149,7 +188,7 @@ def create_librimix_df(librispeech_md_file, librispeech_dir, mixtures_md["noise_path"] = {} mixtures_md["noise_gain"] = {} # Generate pairs of sources to mix - pairs, pairs_noise = set_pairs(librispeech_md_file, wham_md_file, n_src) + pairs, pairs_noise = set_pairs(librispeech_md_file, wham_md_file, n_src, re_use_utterances_for_train) clip_counter = 0 # For each combination create a new line in the dataframe for pair, pair_noise in tqdm(zip(pairs, pairs_noise), total=len(pairs)): @@ -178,83 +217,98 @@ def create_librimix_df(librispeech_md_file, librispeech_dir, return mixtures_md, mixtures_info -def set_pairs(librispeech_md_file, wham_md_file, n_src): - """ set pairs of sources to make the mixture """ +def set_pairs(librispeech_md_file, wham_md_file, n_src, re_use_utterances_for_train): + """ set "pairs" of sources to make the mixture + This function uses 'pair' and 'couple' semantics, but is used for any n_src, not just two.""" # Initialize list for pairs sources + utt_pairs = [] noise_pairs = [] # In train sets utterance are only used once - if 'train' in librispeech_md_file.iloc[0]['subset']: + is_train = 'train' in librispeech_md_file.iloc[0]['subset'] + if is_train and not re_use_utterances_for_train: utt_pairs = set_utt_pairs(librispeech_md_file, utt_pairs, n_src) noise_pairs = set_noise_pairs(utt_pairs, noise_pairs, - librispeech_md_file, wham_md_file) - # Otherwise we want 3000 mixtures + librispeech_md_file, wham_md_file, len(utt_pairs)) + # Otherwise we want 3000 or 1000 mixtures else: - while len(utt_pairs) < 3000: - utt_pairs = 
def set_utt_pairs(librispeech_md_file, pair_list, n_src):
    """Group utterances into mixtures of ``n_src`` distinct speakers.

    Despite the 'pair' / 'couple' wording, this works for any ``n_src``.
    Every metadata row is used in at most one mixture per call.

    Args:
        librispeech_md_file: DataFrame with a ``speaker_ID`` column; rows
            are addressed by position.
        pair_list: list that completed mixtures are appended to (mutated
            in place).
        n_src: number of utterances, each from a different speaker, per
            mixture.

    Returns:
        ``pair_list``, extended with one list of ``n_src`` row positions per
        mixture that could be completed.

    Raises:
        ValueError: if there are at least ``n_src`` rows but fewer than
            ``n_src`` distinct speakers, so no valid mixture can exist.
    """
    # Row positions that have not been consumed by a mixture yet.
    index = set(range(len(librispeech_md_file)))
    # Read the speaker column once instead of one ``iloc`` lookup per draw.
    speaker_ids = librispeech_md_file['speaker_ID'].tolist()

    if len(index) >= n_src and len(set(speaker_ids)) < n_src:
        raise ValueError(
            f"Cannot build mixtures of {n_src} distinct speakers: the "
            f"metadata only contains {len(set(speaker_ids))} speaker(s).")

    while len(index) >= n_src:  # While we still have rows to use
        num_failed_trials = 0
        couple = []
        # Speakers already placed in this mixture. This must live for the
        # whole mixture: resetting it on every retry would let a retry add a
        # second utterance from a speaker that is already in ``couple``.
        speaker_list = set()
        # Give up on this mixture after 200 draws that add nothing.
        while len(couple) < n_src and num_failed_trials < 200:
            # random.sample() no longer accepts a set (TypeError since
            # Python 3.11). tuple(index) is the conversion older versions
            # applied implicitly, so seeded runs draw the same rows.
            population = tuple(index)
            # Heuristic: draw 10 * n_src rows, hoping to find enough new
            # speakers among them.
            samples = random.sample(population,
                                    min(len(population), 10 * n_src))
            found = False
            for sample in samples:
                speaker_id = speaker_ids[sample]
                # Only add speakers not yet used in this mixture.
                if speaker_id not in speaker_list:
                    speaker_list.add(speaker_id)
                    couple.append(sample)
                    index.remove(sample)
                    found = True
                    if len(couple) == n_src:
                        break
            if not found:
                num_failed_trials += 1
        # Rows taken by an abandoned mixture are not returned to the pool,
        # so every outer iteration shrinks ``index`` and the loop ends.
        if len(couple) == n_src:
            pair_list.append(couple)

    return pair_list
def remove_duplicates(utt_pairs, noise_pairs):
    """Drop mixtures that repeat an earlier set of utterances.

    Two mixtures are duplicates when they contain the same utterance
    indices in any order ([s1, s2] == [s2, s1]). The first occurrence is
    kept; later ones are removed together with the noise entry at the same
    position.

    Args:
        utt_pairs: list of mixtures, each a list of hashable, orderable
            utterance indices.
        noise_pairs: list of noise selections, aligned by position with
            ``utt_pairs``.

    Returns:
        Tuple ``(utt_pairs, noise_pairs)`` without the duplicates. The input
        lists are never modified; they are returned as-is when nothing had
        to be removed.
    """
    print('Removing duplicates')
    # One pass with a set of canonical keys replaces the pairwise
    # comparison, which is quadratic in the number of mixtures.
    seen = set()
    indices_to_remove = set()
    for i, pair in enumerate(utt_pairs):
        key = tuple(sorted(pair))  # order-independent identity
        if key in seen:
            indices_to_remove.add(i)
        else:
            seen.add(key)

    if indices_to_remove:
        utt_pairs = [item for i, item in enumerate(utt_pairs)
                     if i not in indices_to_remove]
        noise_pairs = [item for i, item in enumerate(noise_pairs)
                       if i not in indices_to_remove]

    return utt_pairs, noise_pairs