diff --git a/README.md b/README.md index fd26344a2..6e2f82bb4 100644 --- a/README.md +++ b/README.md @@ -138,6 +138,27 @@ To disable fast generation: python generate.py --samples 16000 logdir/train/2017-02-13T16-45-34/model.ckpt-80000 --fast_generation=false ``` +### Getting Good Performance on CPUs without Global Conditioning +Setting the correct number of intra-op threads and inter-op threads can greatly improve performance. For details on intra-op threads and inter-op threads, please see the [Optimize for CPU](https://www.tensorflow.org/performance/performance_guide#optimizing_for_cpu) section of the [TensorFlow Performance Guide](https://www.tensorflow.org/performance/performance_guide). The command is shown below. +``` +python generate.py --wav_out_path=generated.wav --num_intra_threads=intra_thread --num_inter_threads=inter_thread --save_every 10000 --samples 16000 logdir/train/2017-02-13T16-45-34/model.ckpt-80000 +``` +Where: + +`--num_intra_threads` specifies the number of threads used to parallelize each TensorFlow operator. + +`--num_inter_threads` specifies the number of TensorFlow operators that can be executed in parallel. + +For example, the following run command can provide good performance on CPUs: +``` +python generate.py --wav_out_path=generated.wav --num_intra_threads=2 --num_inter_threads=1 --save_every 10000 --samples 16000 logdir/train/2017-02-13T16-45-34/model.ckpt-80000 +``` + +### Optional: Creating a trace file to produce a TensorFlow timeline for profiling +To create a trace file which can be used to view the TensorFlow timeline, another runtime argument `--trace_file=file.json` can be added to the above run command. After the run, traces are written to the file.json file. The full run command is shown below.
+``` +python generate.py --wav_out_path=generated.wav --num_intra_threads=intra_thread --num_inter_threads=inter_thread --trace_file=file.json --save_every 10000 --samples 16000 logdir/train/2017-02-13T16-45-34/model.ckpt-80000 +``` ### Generating with Global Conditioning Generate from a model incorporating global conditioning as follows: ``` @@ -156,6 +177,12 @@ printed out by the train.py script at training time. `--gc_id=311` specifies the id of speaker, speaker 311, for which a sample is to be generated. +### Getting Good Performance on CPUs with Global Conditioning +The following sample command can provide good performance with global conditioning on CPU: +``` +python generate.py --samples 16000 --num_intra_threads=2 --num_inter_threads=1 --wav_out_path speaker311.wav --gc_channels=32 --gc_cardinality=377 --gc_id=311 logdir/train/2017-02-13T16-45-34/model.ckpt-80000 +``` + ## Running tests Install the test requirements diff --git a/generate.py b/generate.py index ecfd2bf6d..0a119ce65 100644 --- a/generate.py +++ b/generate.py @@ -9,6 +9,11 @@ import librosa import numpy as np import tensorflow as tf +#tracefile related +from tensorflow.python.platform import gfile +from tensorflow.python.client import timeline +#time() related +import time from wavenet import WaveNetModel, mu_law_decode, mu_law_encode, audio_reader @@ -95,6 +100,21 @@ def _ensure_positive_float(f): type=int, default=None, help='ID of category to generate, if globally conditioned.') + parser.add_argument( + '--trace_file', + type=str, + default=None, + help='Enable TensorFlow tracing and write trace to this file.') + parser.add_argument( + '--num_intra_threads', + type=int, + default=0, + help='Number of the intra_op_parallelism_threads.') + parser.add_argument( + '--num_inter_threads', + type=int, + default=0, + help='Number of the inter_op_parallelism_threads.') arguments = parser.parse_args() if arguments.gc_channels is not None: if arguments.gc_cardinality is None: @@ -139,7 +159,22 @@ def 
main(): with open(args.wavenet_params, 'r') as config_file: wavenet_params = json.load(config_file) - sess = tf.Session() + #tracefile related + trace_filename = args.trace_file + if trace_filename: + run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) + run_metadata = tf.RunMetadata() + else: + run_options = None + run_metadata = None + + session_config = tf.ConfigProto(allow_soft_placement=True) + #inter and intra parallelism for CPU + session_config.inter_op_parallelism_threads = args.num_inter_threads + session_config.intra_op_parallelism_threads = args.num_intra_threads + session_config.gpu_options.allow_growth = True + + sess = tf.Session(config=session_config) net = WaveNetModel( batch_size=1, @@ -206,6 +241,10 @@ def main(): print('Done.') last_sample_timestamp = datetime.now() + + current_time = time.time() + start_time = time.time() + for step in range(args.samples): if args.fast_generation: outputs = [next_sample] @@ -218,8 +257,13 @@ def main(): window = waveform outputs = [next_sample] - # Run the WaveNet to predict the next sample. - prediction = sess.run(outputs, feed_dict={samples: window})[0] + prediction = sess.run(outputs, feed_dict={samples: window}, options=run_options, run_metadata=run_metadata)[0] + + #tracefile generation related + if trace_filename: + trace = timeline.Timeline(step_stats=run_metadata.step_stats) + with gfile.Open(trace_filename, 'w') as trace_file: + trace_file.write(trace.generate_chrome_trace_format(show_memory=True)) # Scale prediction distribution using temperature. np.seterr(divide='ignore') @@ -248,6 +292,14 @@ def main(): print('Sample {:3500 : + total_time = current_time - start_time + print("Average Throughput of whole run: Samples / sec: %f" % (args.samples/total_time)) + print("Average Latency of whole run: msec / sample: %f" % (total_time/args.samples*1000)) # Save the result as an audio summary. datestring = str(datetime.now()).replace(' ', 'T')