intel · ynakaga · Mar 9, 2025
diff --git a/audio_processor/audio_processor/1089-134686-0001.wav b/audio_processor/audio_processor/1089-134686-0001.wav
diff --git a/audio_processor/audio_processor/__init__.py b/audio_processor/audio_processor/__init__.py
diff --git a/audio_processor/audio_processor/audio_processor_node.py b/audio_processor/audio_processor/audio_processor_node.py
@@ -0,0 +1,83 @@
+import rclpy
+from rclpy.node import Node
+from std_msgs.msg import String
+import ffmpeg
+import numpy as np
+from pydub import AudioSegment
+from openvino.runtime import Core
+
+class AudioProcessorNode(Node):
+    """
+    ROS 2 node that feeds audio files to an OpenVINO model and publishes text.
+
+    The model is read and compiled for the CPU device once, at construction.
+    Results are published as ``std_msgs/String`` on the ``stt_output`` topic.
+    """
+
+    # Location of the converted OpenVINO IR model used when no path is given.
+    DEFAULT_MODEL_PATH = (
+        '/root/ros2_ws/audio_processor/audio_processor/'
+        'wav2vec2-base/wav2vec2-base.xml'
+    )
+    # Sample rate (Hz) audio is converted to before inference.
+    TARGET_SAMPLE_RATE = 16000
+    # Number of samples fed to the model; longer clips are cut, shorter padded.
+    TARGET_LENGTH = 16000
+
+    def __init__(self, model_path=None):
+        """
+        Create the publisher and load and compile the model.
+
+        :param model_path: path to the OpenVINO ``.xml`` model file; falls
+            back to ``DEFAULT_MODEL_PATH`` when omitted.
+        """
+        super().__init__('audio_processor_node')
+        self.publisher_ = self.create_publisher(String, 'stt_output', 10)
+        self.ie = Core()
+        self.model = self.ie.read_model(model=model_path or self.DEFAULT_MODEL_PATH)
+        self.compiled_model = self.ie.compile_model(model=self.model, device_name='CPU')
+        self.input_layer = self.compiled_model.input(0)
+        self.output_layer = self.compiled_model.output(0)
+
+    def process_audio_file(self, file_path):
+        """
+        Decode a ``.mp4`` or ``.wav`` file and run it through the model.
+
+        Any other extension is reported through the node logger and ignored.
+
+        :param file_path: path of the media file to transcribe.
+        """
+        # Compare case-insensitively so 'CLIP.WAV' / 'clip.MP4' are accepted.
+        lowered = file_path.lower()
+        if lowered.endswith('.mp4'):
+            audio_data = self.extract_audio_from_mp4(file_path)
+        elif lowered.endswith('.wav'):
+            audio_data = self.read_wav_file(file_path)
+        else:
+            self.get_logger().error('Unsupported file format')
+            return
+
+        self.process_audio(audio_data)
+
+    def extract_audio_from_mp4(self, file_path):
+        """
+        Extract the audio track of an MP4 file as mono samples.
+
+        The track is written by ffmpeg to a WAV file inside a private
+        temporary directory, which is removed again once the samples are read.
+
+        :param file_path: path of the ``.mp4`` file.
+        :return: 1-D numpy array of samples.
+        """
+        import os
+        import tempfile
+
+        # A private temp dir avoids clobbering (and leaking) a fixed
+        # 'temp_audio.wav' in whatever directory the node was started from.
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            audio_output = os.path.join(tmp_dir, 'temp_audio.wav')
+            stream = ffmpeg.input(file_path).output(
+                audio_output, ac=1, ar=str(self.TARGET_SAMPLE_RATE))
+            stream.run(overwrite_output=True)
+            return self.read_wav_file(audio_output)
+
+    def read_wav_file(self, file_path):
+        """
+        Read a WAV file as mono samples at ``TARGET_SAMPLE_RATE``.
+
+        :param file_path: path of the ``.wav`` file.
+        :return: 1-D numpy array of samples.
+        """
+        audio = AudioSegment.from_wav(file_path)
+        # Downmix and resample first: multi-channel files would otherwise
+        # yield interleaved samples, and other rates would be fed in as-is.
+        audio = audio.set_channels(1).set_frame_rate(self.TARGET_SAMPLE_RATE)
+        return np.array(audio.get_array_of_samples())
+
+    def process_audio(self, audio_data):
+        """
+        Run inference on raw samples and publish the resulting text.
+
+        :param audio_data: 1-D array of audio samples.
+        """
+        input_data = self.preprocess_audio(audio_data)
+        result = self.compiled_model([input_data])[self.output_layer]
+        text_output = self.postprocess_result(result)
+        self.publish_text(text_output)
+
+    def preprocess_audio(self, audio_data):
+        """
+        Peak-normalize the samples and fit them to ``TARGET_LENGTH``.
+
+        :param audio_data: 1-D array of audio samples (any numeric dtype).
+        :return: float32 array of shape ``(1, TARGET_LENGTH)``.
+        """
+        # Convert before taking abs(): on int16 data abs(-32768) overflows.
+        # NOTE(review): float32 assumes the model input is f32 - confirm.
+        audio_data = np.asarray(audio_data, dtype=np.float32)
+
+        # Skip scaling for empty or all-zero (silent) input, which would
+        # otherwise raise or divide by zero and fill the tensor with NaN.
+        peak = float(np.max(np.abs(audio_data))) if audio_data.size else 0.0
+        if peak > 0.0:
+            audio_data = audio_data / peak
+
+        target_length = self.TARGET_LENGTH
+        if len(audio_data) > target_length:
+            audio_data = audio_data[:target_length]  # Trim
+        else:
+            missing = target_length - len(audio_data)
+            audio_data = np.pad(audio_data, (0, missing), 'constant')  # Pad
+
+        return np.expand_dims(audio_data, axis=0)  # Add batch dimension
+
+    def postprocess_result(self, result):
+        """
+        Convert the model output to text.
+
+        Placeholder: the output is not decoded yet and a fixed string is
+        returned regardless of ``result``.
+
+        :param result: raw output of the compiled model.
+        :return: the text to publish.
+        """
+        # TODO: implement decoding of the model output into a transcript.
+        return 'example text'
+
+    def publish_text(self, text):
+        """
+        Publish ``text`` on the ``stt_output`` topic and log it.
+
+        :param text: string to publish.
+        """
+        msg = String()
+        msg.data = text
+        self.publisher_.publish(msg)
+        self.get_logger().info(f'Published: {text}')
+
+def main(args=None):
+    """
+    Start the node, transcribe the bundled example file, then spin.
+
+    The node is destroyed and rclpy shut down on every exit path, including
+    Ctrl-C and failures while loading the model or processing the file.
+
+    :param args: command-line arguments forwarded to ``rclpy.init``.
+    """
+    rclpy.init(args=args)
+    node = None
+    try:
+        node = AudioProcessorNode()
+        # Example: Process an audio file
+        node.process_audio_file(
+            '/root/ros2_ws/audio_processor/audio_processor/1089-134686-0001.wav')
+        rclpy.spin(node)
+    except KeyboardInterrupt:
+        # Ctrl-C is the normal way to stop a spinning node; exit quietly.
+        pass
+    finally:
+        if node is not None:
+            node.destroy_node()
+        # The context may already be shut down (e.g. by a signal handler).
+        if rclpy.ok():
+            rclpy.shutdown()
+
+# Allow running this module directly as a script, in addition to the
+# 'audio_processor_node' console-script entry point.
+if __name__ == '__main__':
+    main()
diff --git a/audio_processor/audio_processor/wav2vec2-base/wav2vec2-base.xml b/audio_processor/audio_processor/wav2vec2-base/wav2vec2-base.xml
diff --git a/audio_processor/package.xml b/audio_processor/package.xml
@@ -0,0 +1,28 @@
+<?xml version="1.0"?>
+<?xml-model href="http://download.ros.org/schema/package_format3.xsd" schematypens="http://www.w3.org/2001/XMLSchema"?>
+<package format="3">
+  <name>audio_processor</name>
+  <version>0.0.0</version>
+  <description>Audio processing node for STT using OpenVINO</description>
+  <maintainer email="yuki.nakagawa@intel.com">yuki</maintainer>
+  <license>Apache-2.0</license>
+
+  <!-- Linters and test runner used by the files under test/. -->
+  <test_depend>ament_copyright</test_depend>
+  <test_depend>ament_flake8</test_depend>
+  <test_depend>ament_pep257</test_depend>
+  <test_depend>python3-pytest</test_depend>
+
+  <!-- No buildtool_depend: the package is built as ament_python (see export). -->
+
+  <depend>rclpy</depend>
+  <depend>std_msgs</depend>
+  <!-- NOTE(review): ffmpeg-python, pydub and openvino may not be rosdep keys;
+       confirm, or document that they must be installed with pip. -->
+  <exec_depend>ffmpeg-python</exec_depend>
+  <exec_depend>pydub</exec_depend>
+  <exec_depend>python3-numpy</exec_depend>
+  <exec_depend>openvino</exec_depend>
+
+  <export>
+    <build_type>ament_python</build_type>
+  </export>
+</package>
+
diff --git a/audio_processor/resource/audio_processor b/audio_processor/resource/audio_processor
diff --git a/audio_processor/setup.cfg b/audio_processor/setup.cfg
@@ -0,0 +1,4 @@
+[develop]
+script_dir=$base/lib/audio_processor
+[install]
+install_scripts=$base/lib/audio_processor
diff --git a/audio_processor/setup.py b/audio_processor/setup.py
@@ -0,0 +1,27 @@
+from setuptools import find_packages, setup
+
+package_name = 'audio_processor'
+
+setup(
+    name=package_name,
+    version='0.0.0',
+    packages=find_packages(exclude=['test']),
+    data_files=[
+        # Marker file that registers the package in the ament resource index.
+        ('share/ament_index/resource_index/packages',
+            ['resource/' + package_name]),
+        ('share/' + package_name, ['package.xml']),
+    ],
+    install_requires=['setuptools'],
+    zip_safe=True,
+    maintainer='yuki',
+    maintainer_email='yuki.nakagawa@intel.com',
+    description='Audio processing node for STT using OpenVINO',
+    # Must match the <license> tag in package.xml.
+    license='Apache-2.0',
+    tests_require=['pytest'],
+    entry_points={
+        'console_scripts': [
+            # Executable name = module path : callable to run.
+            'audio_processor_node = audio_processor.audio_processor_node:main',
+        ],
+    },
+)
+
diff --git a/audio_processor/test/test_copyright.py b/audio_processor/test/test_copyright.py
@@ -0,0 +1,25 @@
+# Copyright 2015 Open Source Robotics Foundation, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ament_copyright.main import main
+import pytest
+
+
+# Remove the `skip` decorator once the source file(s) have a copyright header
+@pytest.mark.skip(reason='No copyright header has been placed in the generated source file.')
+@pytest.mark.copyright
+@pytest.mark.linter
+def test_copyright():
+    """Run the ament_copyright checker on the paths '.' and 'test'."""
+    exit_code = main(argv=['.', 'test'])
+    assert exit_code == 0, 'Found errors'
diff --git a/audio_processor/test/test_flake8.py b/audio_processor/test/test_flake8.py
@@ -0,0 +1,25 @@
+# Copyright 2017 Open Source Robotics Foundation, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ament_flake8.main import main_with_errors
+import pytest
+
+
+@pytest.mark.flake8
+@pytest.mark.linter
+def test_flake8():
+    """Run ament_flake8 with no arguments and fail with the reported errors."""
+    exit_code, problems = main_with_errors(argv=[])
+    # Build the failure message up front: a count line followed by one
+    # reported problem per line.
+    headline = 'Found %d code style errors / warnings:\n' % len(problems)
+    details = '\n'.join(problems)
+    assert exit_code == 0, headline + details
diff --git a/audio_processor/test/test_pep257.py b/audio_processor/test/test_pep257.py
@@ -0,0 +1,23 @@
+# Copyright 2015 Open Source Robotics Foundation, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ament_pep257.main import main
+import pytest
+
+
+@pytest.mark.linter
+@pytest.mark.pep257
+def test_pep257():
+    """Run the ament_pep257 docstring checker on the paths '.' and 'test'."""
+    exit_code = main(argv=['.', 'test'])
+    assert exit_code == 0, 'Found code style errors / warnings'