Create test for a speech sample (InfiniteStreamRecognize) #9336

Closed · wants to merge 12 commits
258 changes: 147 additions & 111 deletions speech/src/main/java/com/example/speech/InfiniteStreamRecognize.java
@@ -30,6 +30,7 @@
import com.google.cloud.speech.v1p1beta1.StreamingRecognizeResponse;
import com.google.protobuf.ByteString;
import com.google.protobuf.Duration;
import java.nio.charset.StandardCharsets;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.concurrent.BlockingQueue;
@@ -39,21 +40,17 @@
import javax.sound.sampled.AudioSystem;
import javax.sound.sampled.DataLine;
import javax.sound.sampled.DataLine.Info;
import javax.sound.sampled.LineUnavailableException;
import javax.sound.sampled.TargetDataLine;

public class InfiniteStreamRecognize {

private static final int STREAMING_LIMIT = 290000; // ~5 minutes

public static final String RED = "\033[0;31m";
public static final String GREEN = "\033[0;32m";
public static final String YELLOW = "\033[0;33m";
private static final int STREAMING_BATCH_LIMIT = 290000; // ~5 minutes
private static final String EXIT_WORD = "exit";
private static final int BYTES_PER_BUFFER = 6000; // buffer size in bytes
private static final BlockingQueue<byte[]> sharedQueue = new LinkedBlockingQueue<byte[]>();

// Creating shared object
private static volatile BlockingQueue<byte[]> sharedQueue = new LinkedBlockingQueue<byte[]>();
private static TargetDataLine targetDataLine;
private static int BYTES_PER_BUFFER = 6400; // buffer size in bytes

private static int restartCounter = 0;
private static ArrayList<ByteString> audioInput = new ArrayList<ByteString>();
private static ArrayList<ByteString> lastAudioInput = new ArrayList<ByteString>();
@@ -63,19 +60,36 @@ public class InfiniteStreamRecognize {
private static boolean newStream = true;
private static double bridgingOffset = 0;
private static boolean lastTranscriptWasFinal = false;
private static boolean stopRecognition = false;
private static StreamController referenceToStreamController;
private static ByteString tempByteString;

public static void main(String... args) {
public static void main(String... args) throws LineUnavailableException {
InfiniteStreamRecognizeOptions options = InfiniteStreamRecognizeOptions.fromFlags(args);
if (options == null) {
// Could not parse.
System.out.println("Failed to parse options.");
System.exit(1);
}

// TODO(developer): Replace the variables before running the sample or use default
// the number of samples per second
int sampleRate = 16000;
// the number of bits in each sample
int sampleSizeInBits = 16;
// the number of channels (1 for mono, 2 for stereo, and so on)
int channels = 1;
// indicates whether the data is signed or unsigned
boolean signed = true;
// indicates whether the data for a single sample is stored in big-endian byte
// order (false means little-endian)
boolean bigEndian = false;

MicBuffer micBuffer = new MicBuffer(sampleRate, sampleSizeInBits, channels, signed, bigEndian);
try {
infiniteStreamingRecognize(options.langCode);
// Say `exit` to stop application execution
infiniteStreamingRecognize(options.langCode, micBuffer, sampleRate,
RecognitionConfig.AudioEncoding.LINEAR16);
Comment on lines +90 to +91

Contributor:
I understand that this method exits when the stream contains the word "exit". It would be useful to document that in the code. Consider renaming the variable to something more verbose (e.g. "exitRecognized") or adding a comment.
nit: Otherwise, if a user ends up killing the app, graceful termination on SIGKILL might be useful if resources require release.

Contributor Author:
I added a comment on how to stop, and this is also printed when the application runs.

Contributor:
We have guidelines for how a code sample should look. This repository is intended to host code used mainly for documentation code snippets. See go/code-snippets-101#what-is-a-code-snippet for the definition. It is not intended for demo applications or other complex solutions.
I agree that printing it makes more sense than placing a comment. However, I see no code line that prints this instruction.

Contributor Author:
Could you please send a direct link? Or, if the link requires a Google account, could you please paste the relevant passage here?

Contributor:
The documentation is only available to Googlers. This is why we experience challenges when external contributors introduce complex code samples or changes to them in this repo.
I would also like to better understand the reasoning behind changing the code sample, since the PR description explains the changes as a test for the already existing code sample.
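On the resource-release nit: a JVM shutdown hook is one common way to close the microphone line on termination. It runs on normal exit, SIGINT, and SIGTERM, though not on SIGKILL, which cannot be intercepted. A minimal sketch, assuming a hypothetical getTargetDataLine() accessor on MicBuffer (not part of this PR):

    // Sketch only: close the capture line when the JVM shuts down.
    // Shutdown hooks run on normal exit, SIGINT, and SIGTERM (not SIGKILL).
    Runtime.getRuntime().addShutdownHook(new Thread(() -> {
      TargetDataLine line = micBuffer.getTargetDataLine(); // hypothetical accessor
      if (line != null && line.isOpen()) {
        line.stop();
        line.close();
      }
    }));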

System.out.println("\nThe application has been stopped.");
} catch (Exception e) {
System.out.println("Exception caught: " + e);
}
@@ -94,87 +108,22 @@ public static String convertMillisToDate(double milliSeconds) {
}

/** Performs infinite streaming speech recognition */
public static void infiniteStreamingRecognize(String languageCode) throws Exception {

// Microphone Input buffering
class MicBuffer implements Runnable {

@Override
public void run() {
System.out.println(YELLOW);
System.out.println("Start speaking...Press Ctrl-C to stop");
targetDataLine.start();
byte[] data = new byte[BYTES_PER_BUFFER];
while (targetDataLine.isOpen()) {
try {
int numBytesRead = targetDataLine.read(data, 0, data.length);
if ((numBytesRead <= 0) && (targetDataLine.isOpen())) {
continue;
}
sharedQueue.put(data.clone());
} catch (InterruptedException e) {
System.out.println("Microphone input buffering interrupted : " + e.getMessage());
}
}
}
}

public static void infiniteStreamingRecognize(String languageCode, Runnable micBuffer,
int sampleRateHertz,
RecognitionConfig.AudioEncoding encoding)
throws Exception {
// Creating microphone input buffer thread
MicBuffer micrunnable = new MicBuffer();
Thread micThread = new Thread(micrunnable);
ResponseObserver<StreamingRecognizeResponse> responseObserver = null;
Thread micThread = new Thread(micBuffer);
try (SpeechClient client = SpeechClient.create()) {
ClientStream<StreamingRecognizeRequest> clientStream;
responseObserver =
new ResponseObserver<StreamingRecognizeResponse>() {

ArrayList<StreamingRecognizeResponse> responses = new ArrayList<>();

public void onStart(StreamController controller) {
referenceToStreamController = controller;
}

public void onResponse(StreamingRecognizeResponse response) {
responses.add(response);
StreamingRecognitionResult result = response.getResultsList().get(0);
Duration resultEndTime = result.getResultEndTime();
resultEndTimeInMS =
(int)
((resultEndTime.getSeconds() * 1000) + (resultEndTime.getNanos() / 1000000));
double correctedTime =
resultEndTimeInMS - bridgingOffset + (STREAMING_LIMIT * restartCounter);

SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0);
if (result.getIsFinal()) {
System.out.print(GREEN);
System.out.print("\033[2K\r");
System.out.printf(
"%s: %s [confidence: %.2f]\n",
convertMillisToDate(correctedTime),
alternative.getTranscript(),
alternative.getConfidence());
isFinalEndTime = resultEndTimeInMS;
lastTranscriptWasFinal = true;
} else {
System.out.print(RED);
System.out.print("\033[2K\r");
System.out.printf(
"%s: %s", convertMillisToDate(correctedTime), alternative.getTranscript());
lastTranscriptWasFinal = false;
}
}

public void onComplete() {}

public void onError(Throwable t) {}
};
clientStream = client.streamingRecognizeCallable().splitCall(responseObserver);
ResponseObserver<StreamingRecognizeResponse> responseObserver = getResponseObserver();
ClientStream<StreamingRecognizeRequest> clientStream = client.streamingRecognizeCallable()
.splitCall(responseObserver);

RecognitionConfig recognitionConfig =
RecognitionConfig.newBuilder()
.setEncoding(RecognitionConfig.AudioEncoding.LINEAR16)
.setEncoding(encoding)
.setLanguageCode(languageCode)
.setSampleRateHertz(16000)
.setSampleRateHertz(sampleRateHertz)
.build();

StreamingRecognitionConfig streamingRecognitionConfig =
@@ -191,31 +140,14 @@ public void onError(Throwable t) {}
clientStream.send(request);

try {
// SampleRate:16000Hz, SampleSizeInBits: 16, Number of channels: 1, Signed: true,
// bigEndian: false
AudioFormat audioFormat = new AudioFormat(16000, 16, 1, true, false);
DataLine.Info targetInfo =
new Info(
TargetDataLine.class,
audioFormat); // Set the system information to read from the microphone audio
// stream

if (!AudioSystem.isLineSupported(targetInfo)) {
System.out.println("Microphone not supported");
System.exit(0);
}
// Target data line captures the audio stream the microphone produces.
targetDataLine = (TargetDataLine) AudioSystem.getLine(targetInfo);
targetDataLine.open(audioFormat);
micThread.setDaemon(true);
micThread.start();

long startTime = System.currentTimeMillis();

while (true) {

while (!stopRecognition) {
long estimatedTime = System.currentTimeMillis() - startTime;

if (estimatedTime >= STREAMING_LIMIT) {
if (estimatedTime >= STREAMING_BATCH_LIMIT) {

clientStream.closeSend();
referenceToStreamController.cancel(); // remove Observer
@@ -244,8 +176,7 @@ public void onError(Throwable t) {}
.setStreamingConfig(streamingRecognitionConfig)
.build();

System.out.println(YELLOW);
System.out.printf("%d: RESTARTING REQUEST\n", restartCounter * STREAMING_LIMIT);
System.out.printf("%d: RESTARTING REQUEST\n", restartCounter * STREAMING_BATCH_LIMIT);

startTime = System.currentTimeMillis();

Expand All @@ -255,7 +186,7 @@ public void onError(Throwable t) {}
// if this is the first audio from a new request
// calculate amount of unfinalized audio from last request
// resend the audio to the speech client before incoming audio
double chunkTime = STREAMING_LIMIT / lastAudioInput.size();
double chunkTime = STREAMING_BATCH_LIMIT / lastAudioInput.size();
// ms length of each chunk in previous request audio arrayList
if (chunkTime != 0) {
if (bridgingOffset < 0) {
@@ -283,7 +214,9 @@ public void onError(Throwable t) {}
newStream = false;
}

tempByteString = ByteString.copyFrom(sharedQueue.take());
ByteString tempByteString = ByteString.copyFrom(sharedQueue.take());

checkStopRecognitionFlag(tempByteString.toByteArray());

request =
StreamingRecognizeRequest.newBuilder().setAudioContent(tempByteString).build();
@@ -293,10 +226,113 @@ public void onError(Throwable t) {}

clientStream.send(request);
}
clientStream.closeSend();
} catch (Exception e) {
System.out.println(e);
}
}
}

public static ResponseObserver<StreamingRecognizeResponse> getResponseObserver() {
return new ResponseObserver<StreamingRecognizeResponse>() {

final ArrayList<StreamingRecognizeResponse> responses = new ArrayList<>();

public void onStart(StreamController controller) {
referenceToStreamController = controller;
}

public void onResponse(StreamingRecognizeResponse response) {
responses.add(response);
StreamingRecognitionResult result = response.getResultsList().get(0);
Duration resultEndTime = result.getResultEndTime();
resultEndTimeInMS =
(int) ((resultEndTime.getSeconds() * 1000) + (resultEndTime.getNanos() / 1000000));
double correctedTime =
resultEndTimeInMS - bridgingOffset + (STREAMING_BATCH_LIMIT * restartCounter);

SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0);
if (result.getIsFinal()) {
System.out.print("\r");
System.out.printf(
"%s: %s [confidence: %.2f]\n",
convertMillisToDate(correctedTime),
alternative.getTranscript(),
alternative.getConfidence());
isFinalEndTime = resultEndTimeInMS;
lastTranscriptWasFinal = true;
} else {
System.out.print("\r");
System.out.printf(
"%s: %s", convertMillisToDate(correctedTime), alternative.getTranscript());
lastTranscriptWasFinal = false;
}
checkStopRecognitionFlag(alternative.getTranscript().getBytes(StandardCharsets.UTF_8));
Contributor:
This method will fail if the word "exit" is not in the first element. Can that happen? Can the code reflect that?

Contributor Author:
This code only reacts when the word "exit" is essentially the whole utterance; that is why we check the element length before comparing.

Contributor:
My question was related to line 256. Is it possible that other, less confident alternatives interpret "exit" while the first one does not?

Contributor Author:
I think there is some possibility of that, so in that case the user would have to repeat "exit".

Contributor:
IMHO it does not look like a good code sample then. Code samples should be deterministic and easy to comprehend.
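For illustration of the reviewer's point: scanning every alternative rather than only the first would make the stop word independent of alternative ordering. A minimal sketch using the same v1p1beta1 types, not code from this PR:

    // Sketch only: stop if ANY alternative in the result matches the exit word.
    private static boolean anyAlternativeIsExit(StreamingRecognitionResult result) {
      for (SpeechRecognitionAlternative alt : result.getAlternativesList()) {
        if (alt.getTranscript().trim().equalsIgnoreCase(EXIT_WORD)) {
          return true;
        }
      }
      return false;
    }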

}

public void onComplete() {
System.out.println("Recognition was stopped");
}

public void onError(Throwable t) {}
};
}
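To make the timestamp correction in onResponse concrete, a worked example under assumed values (illustration only, not code from this PR):

    // Assumed: third stream (restartCounter = 2), result ends 12.5 s into
    // the current stream, no bridging offset.
    int restartCounter = 2;
    double bridgingOffset = 0;
    int resultEndTimeInMS = 12500;
    double correctedTime = resultEndTimeInMS - bridgingOffset
        + ((double) STREAMING_BATCH_LIMIT * restartCounter);
    // 12500 - 0 + 580000 = 592500 ms, i.e. about 9 min 52 s of total audio.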

private static void checkStopRecognitionFlag(byte[] flag) {
if (flag.length <= (EXIT_WORD.length() + 2)) {
stopRecognition = new String(flag).trim().equalsIgnoreCase(EXIT_WORD);
if (stopRecognition) {
putDataToSharedQueue(EXIT_WORD.getBytes(StandardCharsets.UTF_8));
}
}
}
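The length guard means only payloads at most two bytes longer than the exit word are compared at all, which tolerates surrounding whitespace before the trimmed comparison. An illustration with assumed inputs, not code from this PR:

    // Illustration: "exit" is 4 bytes, so the guard admits payloads up to 6 bytes.
    checkStopRecognitionFlag("exit".getBytes(StandardCharsets.UTF_8));     // 4 bytes: trimmed match, stops
    checkStopRecognitionFlag(" exit ".getBytes(StandardCharsets.UTF_8));   // 6 bytes: trims to "exit", stops
    checkStopRecognitionFlag("exit now".getBytes(StandardCharsets.UTF_8)); // 8 bytes: guard skips the comparison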

// Microphone Input buffering
static class MicBuffer implements Runnable {
TargetDataLine targetDataLine;

public MicBuffer(int sampleRate, int sampleSizeInBits,
int channels, boolean signed, boolean bigEndian)
throws LineUnavailableException {
AudioFormat audioFormat
= new AudioFormat(sampleRate, sampleSizeInBits, channels, signed, bigEndian);
DataLine.Info targetInfo =
new Info(
TargetDataLine.class,
audioFormat); // Set the system information to read from the microphone audio
// stream

if (!AudioSystem.isLineSupported(targetInfo)) {
System.out.println("Microphone not supported");
System.exit(0);
}
// Target data line captures the audio stream the microphone produces.
targetDataLine = (TargetDataLine) AudioSystem.getLine(targetInfo);
targetDataLine.open(audioFormat);
}

@Override
public void run() {
System.out.println("Start speaking...Say `exit` to stop");
targetDataLine.start();
byte[] data = new byte[BYTES_PER_BUFFER];
while (targetDataLine.isOpen()) {
int numBytesRead = targetDataLine.read(data, 0, data.length);
if ((numBytesRead <= 0) && (targetDataLine.isOpen())) {
continue;
}
putDataToSharedQueue(data.clone());
}
}
}

public static void putDataToSharedQueue(byte[] data) {
try {
sharedQueue.put(data.clone());
} catch (InterruptedException e) {
System.out.printf("Can't insert data to shared queue. Caused by : %s", e.getMessage());
throw new RuntimeException(e);
}
}
}
// [END speech_transcribe_infinite_streaming]