Skip to content

Commit 4b8b436

Browse files
authored
Patch for Perf Bench (modelscope#506)
* refine perf bench workflow
* fix wrong var in sphinx docs
* refine perf bench workflow
* fix wrong var in sphinx docs
* set python version matrix to include only 3.9 and 3.10
* hide unnecessary logs
* update mem_required for image tagging models
* enable unittests for 3 OPs due to dependency
* add two dependencies required by librosa
1 parent 9f1b0c8 commit 4b8b436

File tree

9 files changed

+19
-17
lines changed

9 files changed

+19
-17
lines changed

.github/workflows/deploy_sphinx_docs.yml

+4-1
Original file line numberDiff line numberDiff line change
@@ -12,13 +12,16 @@ on:
1212
jobs:
1313
pages:
1414
runs-on: ubuntu-20.04
15+
strategy:
16+
matrix:
17+
python-version: [ "3.9", "3.10" ]
1518
steps:
1619
- name: Checkout
1720
uses: actions/checkout@v4
1821
- name: Setup Python ${{ matrix.python-version }}
1922
uses: actions/setup-python@master
2023
with:
21-
python_version: ${{ matrix.python-version }}
24+
python-version: ${{ matrix.python-version }}
2225
- name: Install dependencies
2326
run: |
2427
python -m pip install --upgrade pip

.github/workflows/perf-bench.yml

+3-3
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,8 @@ env:
1616
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
1717

1818
jobs:
19-
unittest-single:
20-
runs-on: [self-hosted, linux]
19+
perf_bench:
20+
runs-on: [GPU, unittest]
2121
environment: Testing
2222
steps:
2323
- uses: actions/checkout@v3
@@ -42,7 +42,7 @@ jobs:
4242
- name: Run performance benchmark standalone
4343
working-directory: dj-${{ github.run_id }}/.github/workflows/docker
4444
run: |
45-
docker compose exec ray-head python tests/benchmark_performance/run.sh ${{ secrets.INTERNAL_WANDB_URL }} ${{ secrets.INTERNAL_WANDB_API_KEY }}
45+
docker compose exec ray-head bash tests/benchmark_performance/run.sh ${{ secrets.INTERNAL_WANDB_URL }} ${{ secrets.INTERNAL_WANDB_API_KEY }}
4646
4747
- name: Remove docker compose
4848
working-directory: dj-${{ github.run_id }}/.github/workflows/docker

configs/config_all.yaml

+3
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,7 @@ process:
212212
radius: 2 # radius of blur kernel
213213
- image_tagging_mapper: # Mapper to generate image tags.
214214
tag_field_name: '__dj__image_tags__' # the field name to store the tags. It's "__dj__image_tags__" in default.
215+
mem_required: '9GB'
215216
- nlpaug_en_mapper: # simply augment texts in English based on the nlpaug library
216217
sequential: false # whether combine all augmentation methods to a sequence. If it's True, a sample will be augmented by all opened augmentation methods sequentially. If it's False, each opened augmentation method would generate its augmented samples independently.
217218
aug_num: 1 # number of augmented samples to be generated. If `sequential` is True, there will be total aug_num augmented samples generated. If it's False, there will be (aug_num * #opened_aug_method) augmented samples generated.
@@ -382,6 +383,7 @@ process:
382383
frame_sampling_method: 'all_keyframes' # sampling method of extracting frame images from the videos. Should be one of ["all_keyframes", "uniform"]. The former one extracts all key frames and the latter one extract specified number of frames uniformly from the video. Default: "all_keyframes".
383384
frame_num: 3 # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
384385
tag_field_name: '__dj__video_frame_tags__' # the field name to store the tags. It's "__dj__video_frame_tags__" in default.
386+
mem_required: '9GB'
385387
- whitespace_normalization_mapper: # normalize different kinds of whitespaces to English whitespace.
386388

387389
# Filter ops
@@ -614,6 +616,7 @@ process:
614616
frame_num: 3 # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
615617
tag_field_name: '__dj__video_frame_tags__' # the field name to store the tags. It's "__dj__video_frame_tags__" in default.
616618
any_or_all: any # keep this sample when any/all videos meet the filter condition
619+
mem_required: '9GB'
617620
- words_num_filter: # filter text with number of words out of specific range
618621
lang: en # sample in which language
619622
tokenization: false # whether to use model to tokenize documents

environments/minimal_requires.txt

+4
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,11 @@ pandas
44
numpy
55
av==13.1.0
66
soundfile
7+
# librosa needs the two extra dependencies below (samplerate, resampy) installed to avoid a lazy_loader error
78
librosa>=0.10
9+
samplerate
10+
resampy
11+
# the two packages above (samplerate, resampy) are librosa dependencies, installed to avoid a lazy_loader error
812
loguru
913
tabulate
1014
tqdm

tests/benchmark_performance/configs/video.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ process:
1414
score_threshold: 1.0
1515
mem_required: '1GB'
1616
- video_tagging_from_frames_mapper:
17+
mem_required: '9GB'
1718
- video_duration_filter:
1819
- video_split_by_key_frame_mapper:
1920
keep_original_sample: false

tests/benchmark_performance/run.sh

+1-1
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ MODALITIES=("text" "image" "video" "audio")
1111
cd $BENCH_PATH
1212

1313
# 1. prepare dataset
14-
wget http://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/data_juicer/perf_bench_data/perf_bench_data.tar.gz && tar zxvf perf_bench_data.tar.gz
14+
wget -q http://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/data_juicer/perf_bench_data/perf_bench_data.tar.gz && tar zxf perf_bench_data.tar.gz
1515

1616
# 2. run the benchmark
1717
for modality in ${MODALITIES[@]}

tests/ops/filter/test_audio_duration_filter.py

+1-4
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,8 @@
55

66
from data_juicer.ops.filter.audio_duration_filter import AudioDurationFilter
77
from data_juicer.utils.constant import Fields
8-
from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, TEST_TAG, SKIPPED_TESTS
8+
from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, TEST_TAG
99

10-
# skip due to conflicts when run lazy_load in multiprocessing in librosa
11-
# tests passed locally.
12-
@SKIPPED_TESTS.register_module()
1310
class AudioDurationFilterTest(DataJuicerTestCaseBase):
1411

1512
data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..',

tests/ops/filter/test_audio_nmf_snr_filter.py

+1-4
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,8 @@
55

66
from data_juicer.ops.filter.audio_nmf_snr_filter import AudioNMFSNRFilter
77
from data_juicer.utils.constant import Fields
8-
from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, SKIPPED_TESTS
8+
from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
99

10-
# skip due to conflicts when run lazy_load in multiprocessing in librosa
11-
# tests passed locally.
12-
@SKIPPED_TESTS.register_module()
1310
class AudioNMFSNRFilterTest(DataJuicerTestCaseBase):
1411

1512
data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..',

tests/ops/mapper/test_video_tagging_from_audio_mapper.py

+1-4
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,8 @@
66
VideoTaggingFromAudioMapper
77
from data_juicer.utils.constant import Fields
88
from data_juicer.utils.mm_utils import SpecialTokens
9-
from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, SKIPPED_TESTS
9+
from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
1010

11-
# skip due to conflicts when run lazy_load in multiprocessing in librosa
12-
# tests passed locally.
13-
@SKIPPED_TESTS.register_module()
1411
class VideoTaggingFromAudioMapperTest(DataJuicerTestCaseBase):
1512
data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..',
1613
'data')

0 commit comments

Comments
 (0)