From f8f46f3c8d1806c22c8ff083792b9e30b21b3fed Mon Sep 17 00:00:00 2001
From: James Lamb
Date: Wed, 20 Aug 2025 21:15:27 -0500
Subject: [PATCH 1/4] [ci] [c++] use 'pre-commit' to run 'cpplint', upgrade to 'cpplint' 2.0.2

---
 .ci/{lint-cpp.sh => check-omp-pragmas.sh}  |  7 -----
 .ci/test.sh                                |  3 --
 .pre-commit-config.yaml                    | 16 +++++++++++
 include/LightGBM/dataset.h                 | 16 ++++++++---
 include/LightGBM/utils/array_args.h        | 28 +++++++++++++++----
 include/LightGBM/utils/common.h            | 19 ++++++++++---
 include/LightGBM/utils/openmp_wrapper.h    |  8 ++++--
 include/LightGBM/utils/text_reader.h       |  8 ++++--
 src/application/application.cpp            |  8 ++++--
 src/boosting/gbdt.cpp                      | 16 ++++++++---
 src/boosting/goss.hpp                      |  4 ++-
 src/boosting/rf.hpp                        |  4 ++-
 src/c_api.cpp                              | 28 ++++++++++++++-----
 src/io/bin.cpp                             |  4 ++-
 src/io/dataset_loader.cpp                  | 20 +++++++++----
 src/io/metadata.cpp                        | 28 ++++++++++++++-----
 src/metric/dcg_calculator.cpp              | 16 ++++++++---
 src/metric/map_metric.hpp                  |  4 ++-
 src/network/linker_topo.cpp                |  4 ++-
 src/objective/rank_objective.hpp           | 12 ++++++--
 src/treelearner/col_sampler.hpp            |  4 ++-
 .../data_parallel_tree_learner.cpp         |  4 ++-
 .../feature_parallel_tree_learner.cpp      |  4 ++-
 src/treelearner/gpu_tree_learner.cpp       | 10 ++++---
 src/treelearner/serial_tree_learner.cpp    |  8 ++++--
 .../voting_parallel_tree_learner.cpp       | 16 ++++++++---
 26 files changed, 221 insertions(+), 78 deletions(-)
 rename .ci/{lint-cpp.sh => check-omp-pragmas.sh} (82%)

diff --git a/.ci/lint-cpp.sh b/.ci/check-omp-pragmas.sh
similarity index 82%
rename from .ci/lint-cpp.sh
rename to .ci/check-omp-pragmas.sh
index be9c71857e28..42c2e52e0450 100755
--- a/.ci/lint-cpp.sh
+++ b/.ci/check-omp-pragmas.sh
@@ -2,13 +2,6 @@
 
 set -e -E -u -o pipefail
 
-echo "running cpplint"
-cpplint \
-    --filter=-build/c++11,-build/include_subdir,-build/header_guard,-whitespace/line_length \
-    --recursive ./src ./include ./R-package ./swig ./tests \
-|| exit 1
-echo "done running cpplint"
-
 echo "checking that all OpenMP pragmas specify num_threads()"
 get_omp_pragmas_without_num_threads() {
     grep \
diff --git a/.ci/test.sh b/.ci/test.sh
index d3cbe36e9908..b02de747ebdd 100755
--- a/.ci/test.sh
+++ b/.ci/test.sh
@@ -105,7 +105,6 @@ if [[ $TASK == "lint" ]]; then
     conda create -q -y -n "${CONDA_ENV}" \
         "${CONDA_PYTHON_REQUIREMENT}" \
         'biome>=1.9.3' \
-        'cpplint>=1.6.0' \
         'matplotlib-base>=3.9.1' \
         'mypy>=1.11.1' \
         'pre-commit>=3.8.0' \
@@ -118,8 +117,6 @@ if [[ $TASK == "lint" ]]; then
     bash ./.ci/run-pre-commit-mypy.sh || exit 1
     echo "Linting R code"
     Rscript ./.ci/lint-r-code.R "${BUILD_DIRECTORY}" || exit 1
-    echo "Linting C++ code"
-    bash ./.ci/lint-cpp.sh || exit 1
     echo "Linting JavaScript code"
     bash ./.ci/lint-js.sh || exit 1
     exit 0
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 567aa92ecc72..4ec0009476f3 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -22,6 +22,22 @@ repos:
     hooks:
       - id: cmakelint
        args: ["--linelength=120", "--filter=-convention/filename,-package/stdargs,-readability/wonkycase"]
+  - repo: https://github.com/cpplint/cpplint
+    rev: '2.0.2'
+    hooks:
+      - id: cpplint
+        args:
+          - --recursive
+          - --filter=-build/c++11,-build/include_subdir,-build/include_what_you_use,-build/header_guard,-whitespace/indent_namespace,-whitespace/line_length
+  - repo: local
+    hooks:
+      - id: check-omp-pragmas
+        name: check-omp-pragmas
+        entry: sh
+        args:
+          - .ci/check-omp-pragmas.sh
+        language: system
+        pass_filenames: false
   - repo: https://github.com/adrienverge/yamllint
     rev: v1.37.1
     hooks:
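
[note: with the two hooks registered above, the checks the CI "lint" task performs should also be reproducible locally. A minimal sketch, assuming `pre-commit>=3.8.0` (the floor already pinned in .ci/test.sh) is installed; the hook ids come from the `id:` fields above:

    # run a single hook against every file in the repo, not just staged files
    pre-commit run cpplint --all-files
    pre-commit run check-omp-pragmas --all-files

`pre-commit run <hook-id> --all-files` is standard pre-commit CLI usage.]
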
diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h
index c2a4b62296f2..b6108c0bbdea 100644
--- a/include/LightGBM/dataset.h
+++ b/include/LightGBM/dataset.h
@@ -554,9 +554,13 @@ class Dataset {
   }
 
   inline void FinishOneRow(int tid, data_size_t row_idx, const std::vector<bool>& is_feature_added) {
-    if (is_finish_load_) { return; }
+    if (is_finish_load_) {
+      return;
+    }
     for (auto fidx : feature_need_push_zeros_) {
-      if (is_feature_added[fidx]) { continue; }
+      if (is_feature_added[fidx]) {
+        continue;
+      }
       const int group = feature2group_[fidx];
       const int sub_feature = feature2subfeature_[fidx];
       feature_groups_[group]->PushData(tid, sub_feature, row_idx, 0.0f);
@@ -587,10 +591,14 @@ class Dataset {
   }
 
   inline void PushOneRow(int tid, data_size_t row_idx, const std::vector<std::pair<int, double>>& feature_values) {
-    if (is_finish_load_) { return; }
+    if (is_finish_load_) {
+      return;
+    }
     std::vector<bool> is_feature_added(num_features_, false);
     for (auto& inner_data : feature_values) {
-      if (inner_data.first >= num_total_features_) { continue; }
+      if (inner_data.first >= num_total_features_) {
+        continue;
+      }
       int feature_idx = used_feature_map_[inner_data.first];
       if (feature_idx >= 0) {
         is_feature_added[feature_idx] = true;
diff --git a/include/LightGBM/utils/array_args.h b/include/LightGBM/utils/array_args.h
index 3c590bad4030..a21ddab8f59d 100644
--- a/include/LightGBM/utils/array_args.h
+++ b/include/LightGBM/utils/array_args.h
@@ -112,17 +112,33 @@ class ArrayArgs {
     VAL_T v = ref[end - 1];
     for (;;) {
       while (ref[++i] > v) {}
-      while (v > ref[--j]) { if (j == start) { break; } }
-      if (i >= j) { break; }
+      while (v > ref[--j]) {
+        if (j == start) {
+          break;
+        }
+      }
+      if (i >= j) {
+        break;
+      }
       std::swap(ref[i], ref[j]);
-      if (ref[i] == v) { p++; std::swap(ref[p], ref[i]); }
-      if (v == ref[j]) { q--; std::swap(ref[j], ref[q]); }
+      if (ref[i] == v) {
+        p++;
+        std::swap(ref[p], ref[i]);
+      }
+      if (v == ref[j]) {
+        q--;
+        std::swap(ref[j], ref[q]);
+      }
     }
     std::swap(ref[i], ref[end - 1]);
     j = i - 1;
     i = i + 1;
-    for (int k = start; k <= p; k++, j--) { std::swap(ref[k], ref[j]); }
-    for (int k = end - 2; k >= q; k--, i++) { std::swap(ref[i], ref[k]); }
+    for (int k = start; k <= p; k++, j--) {
+      std::swap(ref[k], ref[j]);
+    }
+    for (int k = end - 2; k >= q; k--, i++) {
+      std::swap(ref[i], ref[k]);
+    }
     *l = j;
     *r = i;
   }
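
[note: most of the C++ churn in this commit is the same mechanical fix repeated — cpplint 2.0.2 flags a controlled statement that shares a line with its braces, so each `if (cond) { stmt; }` one-liner is unfolded onto separate lines. A rough local repro sketch; the file name and exact message wording are illustrative, not verbatim:

    cat > /tmp/brace_demo.cc <<'EOF'
    void Demo(bool done) {
      if (done) { return; }  // controlled statement on the same line as the braces
    }
    EOF
    cpplint --filter=-legal/copyright /tmp/brace_demo.cc
    # expect a warning along the lines of:
    #   Controlled statements inside brackets of if clause should be
    #   on a separate line  [whitespace/newline]

The `-legal/copyright` filter only silences the missing-header warning for this throwaway file.]
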
diff --git a/include/LightGBM/utils/common.h b/include/LightGBM/utils/common.h
index 67bc07b0ecd5..90610ac98168 100644
--- a/include/LightGBM/utils/common.h
+++ b/include/LightGBM/utils/common.h
@@ -315,9 +315,18 @@ inline static const char* Atof(const char* p, double* out) {
     }
     if (expon > 308) expon = 308;
     // Calculate scaling factor.
-    while (expon >= 50) { scale *= 1E50; expon -= 50; }
-    while (expon >= 8) { scale *= 1E8; expon -= 8; }
-    while (expon > 0) { scale *= 10.0; expon -= 1; }
+    while (expon >= 50) {
+      scale *= 1E50;
+      expon -= 50;
+    }
+    while (expon >= 8) {
+      scale *= 1E8;
+      expon -= 8;
+    }
+    while (expon > 0) {
+      scale *= 10.0;
+      expon -= 1;
+    }
   }
   // Return signed and scaled floating point result.
   *out = sign * (frac ? (value / scale) : (value * scale));
@@ -713,7 +722,9 @@ static void ParallelSort(_RanIt _First, _RanIt _Last, _Pr _Pred, _VTRanIt*) {
       size_t mid = left + s;
       size_t right = mid + s;
       right = std::min(len, right);
-      if (mid >= right) { continue; }
+      if (mid >= right) {
+        continue;
+      }
       std::copy(_First + left, _First + mid, buf + left);
       std::merge(buf + left, buf + mid, _First + mid, _First + right, _First + left, _Pred);
     }
diff --git a/include/LightGBM/utils/openmp_wrapper.h b/include/LightGBM/utils/openmp_wrapper.h
index b9a8ea2982fc..689f27c9f4a5 100644
--- a/include/LightGBM/utils/openmp_wrapper.h
+++ b/include/LightGBM/utils/openmp_wrapper.h
@@ -62,9 +62,13 @@ class ThreadExceptionHelper {
   }
   void CaptureException() {
     // only catch first exception.
-    if (ex_ptr_ != nullptr) { return; }
+    if (ex_ptr_ != nullptr) {
+      return;
+    }
     std::unique_lock<std::mutex> guard(lock_);
-    if (ex_ptr_ != nullptr) { return; }
+    if (ex_ptr_ != nullptr) {
+      return;
+    }
     ex_ptr_ = std::current_exception();
   }
diff --git a/include/LightGBM/utils/text_reader.h b/include/LightGBM/utils/text_reader.h
index ccb25f960d05..b48174542e58 100644
--- a/include/LightGBM/utils/text_reader.h
+++ b/include/LightGBM/utils/text_reader.h
@@ -124,7 +124,9 @@ class TextReader {
           ++i;
           ++total_cnt;
           // skip end of line
-          while ((buffer_process[i] == '\n' || buffer_process[i] == '\r') && i < read_cnt) { ++i; }
+          while ((buffer_process[i] == '\n' || buffer_process[i] == '\r') && i < read_cnt) {
+            ++i;
+          }
           last_i = i;
         } else {
           ++i;
@@ -284,7 +286,9 @@ class TextReader {
           ++i;
           ++total_cnt;
           // skip end of line
-          while ((buffer_process[i] == '\n' || buffer_process[i] == '\r') && i < read_cnt) { ++i; }
+          while ((buffer_process[i] == '\n' || buffer_process[i] == '\r') && i < read_cnt) {
+            ++i;
+          }
           last_i = i;
         } else {
           ++i;
diff --git a/src/application/application.cpp b/src/application/application.cpp
index 42f707f0c801..7dcb182fb981 100644
--- a/src/application/application.cpp
+++ b/src/application/application.cpp
@@ -121,7 +121,9 @@ void Application::LoadData() {
   if (config_.is_provide_training_metric) {
     for (auto metric_type : config_.metric) {
       auto metric = std::unique_ptr<Metric>(Metric::CreateMetric(metric_type, config_));
-      if (metric == nullptr) { continue; }
+      if (metric == nullptr) {
+        continue;
+      }
       metric->Init(train_data_->metadata(), train_data_->num_data());
       train_metric_.push_back(std::move(metric));
     }
@@ -149,7 +151,9 @@ void Application::LoadData() {
       valid_metrics_.emplace_back();
       for (auto metric_type : config_.metric) {
         auto metric = std::unique_ptr<Metric>(Metric::CreateMetric(metric_type, config_));
-        if (metric == nullptr) { continue; }
+        if (metric == nullptr) {
+          continue;
+        }
         metric->Init(valid_datas_.back()->metadata(),
                      valid_datas_.back()->num_data());
         valid_metrics_.back().push_back(std::move(metric));
diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp
index 737c69072b64..5bfa8d6605d4 100644
--- a/src/boosting/gbdt.cpp
+++ b/src/boosting/gbdt.cpp
@@ -210,7 +210,9 @@ void GBDT::AddValidDataset(const Dataset* valid_data,
 
   if (early_stopping_round_ > 0) {
     auto num_metrics = valid_metrics.size();
-    if (es_first_metric_only_) { num_metrics = 1; }
+    if (es_first_metric_only_) {
+      num_metrics = 1;
+    }
     best_iter_.emplace_back(num_metrics, 0);
     best_score_.emplace_back(num_metrics, kMinScore);
     best_msg_.emplace_back(num_metrics);
@@ -452,7 +454,9 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) {
 }
 
 void GBDT::RollbackOneIter() {
-  if (iter_ <= 0) { return; }
+  if (iter_ <= 0) {
+    return;
+  }
   // reset score
   for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
     auto curr_tree = models_.size() - num_tree_per_iteration_ + cur_tree_id;
@@ -588,7 +592,9 @@ std::string GBDT::OutputMetric(int iter) {
         msg_buf << tmp_buf.str() << '\n';
       }
     }
-    if (es_first_metric_only_ && j > 0) { continue; }
+    if (es_first_metric_only_ && j > 0) {
+      continue;
+    }
     if (ret.empty() && early_stopping_round_ > 0) {
       auto cur_score = valid_metrics_[i][j]->factor_to_bigger_better() * test_scores.back();
       if (cur_score - best_score_[i][j] > early_stopping_min_delta_) {
        best_score_[i][j] = cur_score;
         best_iter_[i][j] = iter;
         meet_early_stopping_pairs.emplace_back(i, j);
       } else {
-        if (iter - best_iter_[i][j] >= early_stopping_round_) { ret = best_msg_[i][j]; }
+        if (iter - best_iter_[i][j] >= early_stopping_round_) {
+          ret = best_msg_[i][j];
+        }
       }
     }
   }
diff --git a/src/boosting/goss.hpp b/src/boosting/goss.hpp
index 9d55d89ee097..a853cc3c35c6 100644
--- a/src/boosting/goss.hpp
+++ b/src/boosting/goss.hpp
@@ -30,7 +30,9 @@ class GOSSStrategy : public SampleStrategy {
   void Bagging(int iter, TreeLearner* tree_learner, score_t* gradients, score_t* hessians) override {
     bag_data_cnt_ = num_data_;
     // not subsample for first iterations
-    if (iter < static_cast<int>(1.0f / config_->learning_rate)) { return; }
+    if (iter < static_cast<int>(1.0f / config_->learning_rate)) {
+      return;
+    }
     auto left_cnt = bagging_runner_.Run<true>(
         num_data_,
         [=](int, data_size_t cur_start, data_size_t cur_cnt, data_size_t* left,
diff --git a/src/boosting/rf.hpp b/src/boosting/rf.hpp
index 3eb065b30f2d..8b20b3a75310 100644
--- a/src/boosting/rf.hpp
+++ b/src/boosting/rf.hpp
@@ -182,7 +182,9 @@ class RF : public GBDT {
   }
 
   void RollbackOneIter() override {
-    if (iter_ <= 0) { return; }
+    if (iter_ <= 0) {
+      return;
+    }
     int cur_iter = iter_ + num_init_iteration_ - 1;
     // reset score
     for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
diff --git a/src/c_api.cpp b/src/c_api.cpp
index e5372646664c..882494f35dc0 100644
--- a/src/c_api.cpp
+++ b/src/c_api.cpp
@@ -218,7 +218,9 @@ class Booster {
     for (auto metric_type : config_.metric) {
       auto metric = std::unique_ptr<Metric>(
           Metric::CreateMetric(metric_type, config_));
-      if (metric == nullptr) { continue; }
+      if (metric == nullptr) {
+        continue;
+      }
       metric->Init(train_data_->metadata(), train_data_->num_data());
       train_metric_.push_back(std::move(metric));
     }
@@ -394,7 +396,9 @@ class Booster {
     valid_metrics_.emplace_back();
     for (auto metric_type : config_.metric) {
       auto metric = std::unique_ptr<Metric>(Metric::CreateMetric(metric_type, config_));
-      if (metric == nullptr) { continue; }
+      if (metric == nullptr) {
+        continue;
+      }
       metric->Init(valid_data->metadata(), valid_data->num_data());
       valid_metrics_.back().push_back(std::move(metric));
     }
@@ -1603,7 +1607,9 @@ int LGBM_DatasetCreateFromCSC(const void* col_ptr,
     OMP_LOOP_EX_BEGIN();
     const int tid = omp_get_thread_num();
     int feature_idx = ret->InnerFeatureIndex(i);
-    if (feature_idx < 0) { continue; }
+    if (feature_idx < 0) {
+      continue;
+    }
     int group = ret->Feature2Group(feature_idx);
     int sub_feature = ret->Feture2SubFeature(feature_idx);
     CSC_RowIterator col_it(col_ptr, col_ptr_type, indices, data, data_type, ncol_ptr, nelem, i);
@@ -1614,7 +1620,9 @@ int LGBM_DatasetCreateFromCSC(const void* col_ptr,
         auto pair = col_it.NextNonZero();
         row_idx = pair.first;
         // no more data
-        if (row_idx < 0) { break; }
+        if (row_idx < 0) {
+          break;
+        }
         ret->PushOneData(tid, row_idx, group, feature_idx, sub_feature, pair.second);
       }
     } else {
@@ -1838,7 +1846,9 @@ int LGBM_DatasetSetField(DatasetHandle handle,
   } else if (type == C_API_DTYPE_FLOAT64) {
     is_success = dataset->SetDoubleField(field_name, reinterpret_cast<const double*>(field_data),
                                          static_cast<int32_t>(num_element));
   }
-  if (!is_success) { Log::Fatal("Input data type error or field not found"); }
+  if (!is_success) {
+    Log::Fatal("Input data type error or field not found");
+  }
   API_END();
 }
@@ -1875,8 +1885,12 @@ int LGBM_DatasetGetField(DatasetHandle handle,
     *out_type = C_API_DTYPE_FLOAT64;
     is_success = true;
   }
-  if (!is_success) { Log::Fatal("Field not found"); }
-  if (*out_ptr == nullptr) { *out_len = 0; }
+  if (!is_success) {
+    Log::Fatal("Field not found");
+  }
+  if (*out_ptr == nullptr) {
+    *out_len = 0;
+  }
   API_END();
 }
diff --git a/src/io/bin.cpp b/src/io/bin.cpp
index 3d84599e6589..23434d1932c9 100644
--- a/src/io/bin.cpp
+++ b/src/io/bin.cpp
@@ -131,7 +131,9 @@ namespace LightGBM {
       upper_bounds[bin_cnt] = distinct_values[i];
       ++bin_cnt;
       lower_bounds[bin_cnt] = distinct_values[i + 1];
-      if (bin_cnt >= max_bin - 1) { break; }
+      if (bin_cnt >= max_bin - 1) {
+        break;
+      }
       cur_cnt_inbin = 0;
       if (!is_big_count_value[i]) {
         --rest_bin_cnt;
diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp
index 9c8a0417b118..4931a93d826e 100644
--- a/src/io/dataset_loader.cpp
+++ b/src/io/dataset_loader.cpp
@@ -665,7 +665,9 @@ Dataset* DatasetLoader::ConstructFromSampleData(double** sample_values,
   std::vector<int> start(num_machines);
   std::vector<int> len(num_machines);
   int step = (num_total_features + num_machines - 1) / num_machines;
-  if (step < 1) { step = 1; }
+  if (step < 1) {
+    step = 1;
+  }
 
   start[0] = 0;
   for (int i = 0; i < num_machines - 1; ++i) {
@@ -1168,7 +1170,9 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
     std::vector<int> start(num_machines);
     std::vector<int> len(num_machines);
     int step = (dataset->num_total_features_ + num_machines - 1) / num_machines;
-    if (step < 1) { step = 1; }
+    if (step < 1) {
+      step = 1;
+    }
 
     start[0] = 0;
     for (int i = 0; i < num_machines - 1; ++i) {
@@ -1284,7 +1288,9 @@ void DatasetLoader::ExtractFeaturesFromMemory(std::vector<std::string>* text_dat
       std::vector<bool> is_feature_added(dataset->num_features_, false);
       // push data
       for (auto& inner_data : oneline_features) {
-        if (inner_data.first >= dataset->num_total_features_) { continue; }
+        if (inner_data.first >= dataset->num_total_features_) {
+          continue;
+        }
         int feature_idx = dataset->used_feature_map_[inner_data.first];
         if (feature_idx >= 0) {
           is_feature_added[feature_idx] = true;
@@ -1341,7 +1347,9 @@ void DatasetLoader::ExtractFeaturesFromMemory(std::vector<std::string>* text_dat
       // push data
       std::vector<bool> is_feature_added(dataset->num_features_, false);
       for (auto& inner_data : oneline_features) {
-        if (inner_data.first >= dataset->num_total_features_) { continue; }
+        if (inner_data.first >= dataset->num_total_features_) {
+          continue;
+        }
         int feature_idx = dataset->used_feature_map_[inner_data.first];
         if (feature_idx >= 0) {
           is_feature_added[feature_idx] = true;
@@ -1414,7 +1422,9 @@ void DatasetLoader::ExtractFeaturesFromFile(const char* filename, const Parser*
       std::vector<bool> is_feature_added(dataset->num_features_, false);
       // push data
       for (auto& inner_data : oneline_features) {
-        if (inner_data.first >= dataset->num_total_features_) { continue; }
+        if (inner_data.first >= dataset->num_total_features_) {
+          continue;
+        }
         int feature_idx = dataset->used_feature_map_[inner_data.first];
         if (feature_idx >= 0) {
           is_feature_added[feature_idx] = true;
diff --git a/src/io/metadata.cpp b/src/io/metadata.cpp
index f6f07c434661..d5504454cdf7 100644
--- a/src/io/metadata.cpp
+++ b/src/io/metadata.cpp
@@ -56,7 +56,9 @@ void Metadata::Init(data_size_t num_data, int weight_idx, int query_idx) {
       Log::Info("Using query id in data file, ignoring the additional query file");
       query_boundaries_.clear();
     }
-    if (!query_weights_.empty()) { query_weights_.clear(); }
+    if (!query_weights_.empty()) {
+      query_weights_.clear();
+    }
     queries_ = std::vector<data_size_t>(num_data_, 0);
     query_load_from_file_ = false;
   }
@@ -401,7 +403,9 @@ void Metadata::InsertInitScores(const double* init_scores, data_size_t start_ind
     // Note that len here is row count, not num_init_score, so we compare against num_data
     Log::Fatal("Inserted initial score data is too large for dataset");
   }
-  if (init_score_.empty()) { init_score_.resize(num_init_score_); }
+  if (init_score_.empty()) {
+    init_score_.resize(num_init_score_);
+  }
 
   int nclasses = num_init_score_classes();
 
@@ -455,7 +459,9 @@ void Metadata::InsertLabels(const label_t* labels, data_size_t start_index, data
   if (start_index + len > num_data_) {
     Log::Fatal("Inserted label data is too large for dataset");
   }
-  if (label_.empty()) { label_.resize(num_data_); }
+  if (label_.empty()) {
+    label_.resize(num_data_);
+  }
 
   memcpy(label_.data() + start_index, labels, sizeof(label_t) * len);
 
@@ -511,7 +517,9 @@ void Metadata::InsertWeights(const label_t* weights, data_size_t start_index, da
   if (start_index + len > num_weights_) {
     Log::Fatal("Inserted weight data is too large for dataset");
  }
-  if (weights_.empty()) { weights_.resize(num_weights_); }
+  if (weights_.empty()) {
+    weights_.resize(num_weights_);
+  }
 
   memcpy(weights_.data() + start_index, weights, sizeof(label_t) * len);
 
@@ -797,20 +805,26 @@ void Metadata::LoadFromMemory(const void* memory) {
   num_queries_ = *(reinterpret_cast<const data_size_t*>(mem_ptr));
   mem_ptr += VirtualFileWriter::AlignedSize(sizeof(num_queries_));
 
-  if (!label_.empty()) { label_.clear(); }
+  if (!label_.empty()) {
+    label_.clear();
+  }
   label_ = std::vector<label_t>(num_data_);
   std::memcpy(label_.data(), mem_ptr, sizeof(label_t) * num_data_);
   mem_ptr += VirtualFileWriter::AlignedSize(sizeof(label_t) * num_data_);
 
   if (num_weights_ > 0) {
-    if (!weights_.empty()) { weights_.clear(); }
+    if (!weights_.empty()) {
+      weights_.clear();
+    }
     weights_ = std::vector<label_t>(num_weights_);
     std::memcpy(weights_.data(), mem_ptr, sizeof(label_t) * num_weights_);
     mem_ptr += VirtualFileWriter::AlignedSize(sizeof(label_t) * num_weights_);
     weight_load_from_file_ = true;
   }
   if (num_queries_ > 0) {
-    if (!query_boundaries_.empty()) { query_boundaries_.clear(); }
+    if (!query_boundaries_.empty()) {
+      query_boundaries_.clear();
+    }
     query_boundaries_ = std::vector<data_size_t>(num_queries_ + 1);
     std::memcpy(query_boundaries_.data(), mem_ptr, sizeof(data_size_t) * (num_queries_ + 1));
     mem_ptr += VirtualFileWriter::AlignedSize(sizeof(data_size_t) *
diff --git a/src/metric/dcg_calculator.cpp b/src/metric/dcg_calculator.cpp
index 316fdf0a6ddf..6af35c657e99 100644
--- a/src/metric/dcg_calculator.cpp
+++ b/src/metric/dcg_calculator.cpp
@@ -31,7 +31,9 @@ void DCGCalculator::DefaultEvalAt(std::vector<int>* eval_at) {
 }
 
 void DCGCalculator::DefaultLabelGain(std::vector<double>* label_gain) {
-  if (!label_gain->empty()) { return; }
+  if (!label_gain->empty()) {
+    return;
+  }
   // label_gain = 2^i - 1, may overflow, so we use 31 here
   const int max_label = 31;
   label_gain->push_back(0.0f);
@@ -60,7 +62,9 @@ double DCGCalculator::CalMaxDCGAtK(data_size_t k, const label_t* label, data_siz
   }
 
   int top_label = static_cast<int>(label_gain_.size()) - 1;
-  if (k > num_data) { k = num_data; }
+  if (k > num_data) {
+    k = num_data;
+  }
   // start from top label, and accumulate DCG
   for (data_size_t j = 0; j < k; ++j) {
     while (top_label > 0 && label_cnt[top_label] <= 0) {
@@ -90,7 +94,9 @@ void DCGCalculator::CalMaxDCG(const std::vector<data_size_t>& ks,
   // calculate k Max DCG by one pass
   for (size_t i = 0; i < ks.size(); ++i) {
     data_size_t cur_k = ks[i];
-    if (cur_k > num_data) { cur_k = num_data; }
+    if (cur_k > num_data) {
+      cur_k = num_data;
+    }
     for (data_size_t j = cur_left; j < cur_k; ++j) {
       while (top_label > 0 && label_cnt[top_label] <= 0) {
         top_label -= 1;
@@ -121,7 +127,9 @@ void DCGCalculator::CalDCG(const std::vector<data_size_t>& ks, const label_t* la
   // calculate multi dcg by one pass
   for (size_t i = 0; i < ks.size(); ++i) {
     data_size_t cur_k = ks[i];
-    if (cur_k > num_data) { cur_k = num_data; }
+    if (cur_k > num_data) {
+      cur_k = num_data;
+    }
     for (data_size_t j = cur_left; j < cur_k; ++j) {
       data_size_t idx = sorted_idx[j];
       cur_result += label_gain_[static_cast<int>(label[idx])] * discount_[j];
diff --git a/src/metric/map_metric.hpp b/src/metric/map_metric.hpp
index 76fb4fe8a3e9..3f79dcc51888 100644
--- a/src/metric/map_metric.hpp
+++ b/src/metric/map_metric.hpp
@@ -86,7 +86,9 @@ class MapMetric:public Metric {
     data_size_t cur_left = 0;
     for (size_t i = 0; i < ks.size(); ++i) {
       data_size_t cur_k = static_cast<data_size_t>(ks[i]);
-      if (cur_k > num_data) { cur_k = num_data; }
+      if (cur_k > num_data) {
+        cur_k = num_data;
+      }
       for (data_size_t j = cur_left; j < cur_k; ++j) {
         data_size_t idx = sorted_idx[j];
         if (label[idx] > 0.5f) {
diff --git a/src/network/linker_topo.cpp b/src/network/linker_topo.cpp
index af46ef4f494e..6823ec6064b0 100644
--- a/src/network/linker_topo.cpp
+++ b/src/network/linker_topo.cpp
@@ -68,7 +68,9 @@ RecursiveHalvingMap::RecursiveHalvingMap(int in_k, RecursiveHalvingNodeType _typ
 RecursiveHalvingMap RecursiveHalvingMap::Construct(int rank, int num_machines) {
   // construct all recursive halving map for all machines
   int k = 0;
-  while ((1 << k) <= num_machines) { ++k; }
+  while ((1 << k) <= num_machines) {
+    ++k;
+  }
   // let 1 << k <= num_machines
   --k;
   // distance of each communication
diff --git a/src/objective/rank_objective.hpp b/src/objective/rank_objective.hpp
index 8227c7b65658..61d560bb6af8 100644
--- a/src/objective/rank_objective.hpp
+++ b/src/objective/rank_objective.hpp
@@ -206,11 +206,17 @@ class LambdarankNDCG : public RankingObjective {
     double sum_lambdas = 0.0;
     // start accumulate lambdas by pairs that contain at least one document above truncation level
     for (data_size_t i = 0; i < cnt - 1 && i < truncation_level_; ++i) {
-      if (score[sorted_idx[i]] == kMinScore) { continue; }
+      if (score[sorted_idx[i]] == kMinScore) {
+        continue;
+      }
       for (data_size_t j = i + 1; j < cnt; ++j) {
-        if (score[sorted_idx[j]] == kMinScore) { continue; }
+        if (score[sorted_idx[j]] == kMinScore) {
+          continue;
+        }
         // skip pairs with the same labels
-        if (label[sorted_idx[i]] == label[sorted_idx[j]]) { continue; }
+        if (label[sorted_idx[i]] == label[sorted_idx[j]]) {
+          continue;
+        }
         data_size_t high_rank, low_rank;
         if (label[sorted_idx[i]] > label[sorted_idx[j]]) {
           high_rank = i;
diff --git a/src/treelearner/col_sampler.hpp b/src/treelearner/col_sampler.hpp
index c70b07e50efa..0151337965f1 100644
--- a/src/treelearner/col_sampler.hpp
+++ b/src/treelearner/col_sampler.hpp
@@ -100,7 +100,9 @@ class ColSampler {
         allowed_features.insert(constraint.begin(), constraint.end());
       }
       for (int feat : branch_features) {
-        if (constraint.count(feat) == 0) { break; }
+        if (constraint.count(feat) == 0) {
+          break;
+        }
        ++num_feat_found;
        if (num_feat_found == static_cast<int>(branch_features.size())) {
          allowed_features.insert(constraint.begin(), constraint.end());
diff --git a/src/treelearner/data_parallel_tree_learner.cpp b/src/treelearner/data_parallel_tree_learner.cpp
index 670788118455..deb74f9cde64 100644
--- a/src/treelearner/data_parallel_tree_learner.cpp
+++ b/src/treelearner/data_parallel_tree_learner.cpp
@@ -128,7 +128,9 @@ void DataParallelTreeLearner<TREE_LEARNER_TYPE>::BeforeTrain() {
   std::vector<int> num_bins_distributed(num_machines_, 0);
   for (int i = 0; i < this->train_data_->num_total_features(); ++i) {
     int inner_feature_index = this->train_data_->InnerFeatureIndex(i);
-    if (inner_feature_index == -1) { continue; }
+    if (inner_feature_index == -1) {
+      continue;
+    }
     if (this->col_sampler_.is_feature_used_bytree()[inner_feature_index]) {
       int cur_min_machine = static_cast<int>(ArrayArgs<int>::ArgMin(num_bins_distributed));
      feature_distribution[cur_min_machine].push_back(inner_feature_index);
diff --git a/src/treelearner/feature_parallel_tree_learner.cpp b/src/treelearner/feature_parallel_tree_learner.cpp
index c5202f3d706d..4d5324d8fd51 100644
--- a/src/treelearner/feature_parallel_tree_learner.cpp
+++ b/src/treelearner/feature_parallel_tree_learner.cpp
@@ -42,7 +42,9 @@ void FeatureParallelTreeLearner<TREE_LEARNER_TYPE>::BeforeTrain() {
   std::vector<int> num_bins_distributed(num_machines_, 0);
   for (int i = 0; i < this->train_data_->num_total_features(); ++i) {
     int inner_feature_index = this->train_data_->InnerFeatureIndex(i);
-    if (inner_feature_index == -1) { continue; }
+    if (inner_feature_index == -1) {
+      continue;
+    }
     if (this->col_sampler_.is_feature_used_bytree()[inner_feature_index]) {
       int cur_min_machine = static_cast<int>(ArrayArgs<int>::ArgMin(num_bins_distributed));
       feature_distribution[cur_min_machine].push_back(inner_feature_index);
diff --git a/src/treelearner/gpu_tree_learner.cpp b/src/treelearner/gpu_tree_learner.cpp
index 1bf21d65ccc6..8d0bdc5549d4 100644
--- a/src/treelearner/gpu_tree_learner.cpp
+++ b/src/treelearner/gpu_tree_learner.cpp
@@ -245,7 +245,7 @@ void GPUTreeLearner::AllocateGPUMemory() {
   }
   // allocate memory for all features (FIXME: 4 GB barrier on some devices, need to split to multiple buffers)
   device_features_.reset();
-  device_features_ = std::unique_ptr<boost::compute::vector<Feature4>>(new boost::compute::vector<Feature4>((uint64_t)num_dense_feature4_ * num_data_, ctx_));
+  device_features_ = std::unique_ptr<boost::compute::vector<Feature4>>(new boost::compute::vector<Feature4>(static_cast<uint64_t>(num_dense_feature4_) * num_data_, ctx_));
   // unpin old buffer if necessary before destructing them
   if (ptr_pinned_gradients_) {
     queue_.enqueue_unmap_buffer(pinned_gradients_, ptr_pinned_gradients_);
@@ -427,7 +427,7 @@ void GPUTreeLearner::AllocateGPUMemory() {
     }
     #pragma omp critical
     queue_.enqueue_write_buffer(device_features_->get_buffer(),
-                        (uint64_t)i * num_data_ * sizeof(Feature4), num_data_ * sizeof(Feature4), host4);
+                        static_cast<uint64_t>(i) * num_data_ * sizeof(Feature4), num_data_ * sizeof(Feature4), host4);
     #if GPU_DEBUG >= 1
     printf("first example of feature-group tuple is: %d %d %d %d\n", host4[0].s[0], host4[0].s[1], host4[0].s[2], host4[0].s[3]);
     printf("Feature-groups copied to device with multipliers ");
@@ -503,7 +503,7 @@ void GPUTreeLearner::AllocateGPUMemory() {
   }
   // copying the last 1 to (dword_features - 1) feature-groups in the last tuple
   queue_.enqueue_write_buffer(device_features_->get_buffer(),
-                       (num_dense_feature4_ - 1) * (uint64_t)num_data_ * sizeof(Feature4), num_data_ * sizeof(Feature4), host4);
+                       (num_dense_feature4_ - 1) * static_cast<uint64_t>(num_data_) * sizeof(Feature4), num_data_ * sizeof(Feature4), host4);
   #if GPU_DEBUG >= 1
   printf("Last features copied to device\n");
   #endif
@@ -1089,7 +1089,9 @@ void GPUTreeLearner::FindBestSplits(const Tree* tree) {
     size_t bin_size = train_data_->FeatureNumBin(feature_index) + 1;
     printf("Feature %d smaller leaf:\n", feature_index);
     PrintHistograms(smaller_leaf_histogram_array_[feature_index].RawData() - kHistOffset, bin_size);
-    if (larger_leaf_splits_ == nullptr || larger_leaf_splits_->leaf_index() < 0) { continue; }
+    if (larger_leaf_splits_ == nullptr || larger_leaf_splits_->leaf_index() < 0) {
+      continue;
+    }
     printf("Feature %d larger leaf:\n", feature_index);
     PrintHistograms(larger_leaf_histogram_array_[feature_index].RawData() - kHistOffset, bin_size);
   }
diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp
index 01cdd7623c02..4d62ae370791 100644
--- a/src/treelearner/serial_tree_learner.cpp
+++ b/src/treelearner/serial_tree_learner.cpp
@@ -368,12 +368,16 @@ bool SerialTreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int
     larger_leaf_histogram_array_ = nullptr;
   } else if (num_data_in_left_child < num_data_in_right_child) {
     // put parent(left) leaf's histograms into larger leaf's histograms
-    if (histogram_pool_.Get(left_leaf, &larger_leaf_histogram_array_)) { parent_leaf_histogram_array_ = larger_leaf_histogram_array_; }
+    if (histogram_pool_.Get(left_leaf, &larger_leaf_histogram_array_)) {
+      parent_leaf_histogram_array_ = larger_leaf_histogram_array_;
+    }
     histogram_pool_.Move(left_leaf, right_leaf);
     histogram_pool_.Get(left_leaf, &smaller_leaf_histogram_array_);
   } else {
     // put parent(left) leaf's histograms to larger leaf's histograms
-    if (histogram_pool_.Get(left_leaf, &larger_leaf_histogram_array_)) { parent_leaf_histogram_array_ = larger_leaf_histogram_array_; }
+    if (histogram_pool_.Get(left_leaf, &larger_leaf_histogram_array_)) {
+      parent_leaf_histogram_array_ = larger_leaf_histogram_array_;
+    }
     histogram_pool_.Get(right_leaf, &smaller_leaf_histogram_array_);
   }
   return true;
diff --git a/src/treelearner/voting_parallel_tree_learner.cpp b/src/treelearner/voting_parallel_tree_learner.cpp
index 37f2d4cf2641..f18340634496 100644
--- a/src/treelearner/voting_parallel_tree_learner.cpp
+++ b/src/treelearner/voting_parallel_tree_learner.cpp
@@ -268,7 +268,9 @@ void VotingParallelTreeLearner<TREE_LEARNER_TYPE>::FindBestSplits(const Tree* tree)
   #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
   for (int feature_index = 0; feature_index < this->num_features_; ++feature_index) {
     OMP_LOOP_EX_BEGIN();
-    if (!is_feature_used[feature_index]) { continue; }
+    if (!is_feature_used[feature_index]) {
+      continue;
+    }
     const BinMapper* feature_bin_mapper = this->train_data_->FeatureBinMapper(feature_index);
     const int num_bin = feature_bin_mapper->num_bin();
     const int offset = static_cast<int>(feature_bin_mapper->GetMostFreqBin() == 0);
@@ -288,7 +290,9 @@ void VotingParallelTreeLearner<TREE_LEARNER_TYPE>::FindBestSplits(const Tree* tree)
   #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
   for (int feature_index = 0; feature_index < this->num_features_; ++feature_index) {
     OMP_LOOP_EX_BEGIN();
-    if (!is_feature_used[feature_index]) { continue; }
+    if (!is_feature_used[feature_index]) {
+      continue;
+    }
     const BinMapper* feature_bin_mapper = this->train_data_->FeatureBinMapper(feature_index);
     const int num_bin = feature_bin_mapper->num_bin();
     const int offset = static_cast<int>(feature_bin_mapper->GetMostFreqBin() == 0);
@@ -310,7 +314,9 @@ void VotingParallelTreeLearner<TREE_LEARNER_TYPE>::FindBestSplits(const Tree* tree)
  #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
   for (int feature_index = 0; feature_index < this->num_features_; ++feature_index) {
     OMP_LOOP_EX_BEGIN();
-    if (!is_feature_used[feature_index]) { continue; }
+    if (!is_feature_used[feature_index]) {
+      continue;
+    }
     const int real_feature_index = this->train_data_->RealFeatureIndex(feature_index);
     this->train_data_->FixHistogram(feature_index,
                                     this->smaller_leaf_splits_->sum_gradients(), this->smaller_leaf_splits_->sum_hessians(),
@@ -323,7 +329,9 @@ void VotingParallelTreeLearner<TREE_LEARNER_TYPE>::FindBestSplits(const Tree* tree)
         &smaller_bestsplit_per_features[feature_index], smaller_leaf_parent_output);
     // only has root leaf
-    if (this->larger_leaf_splits_ == nullptr || this->larger_leaf_splits_->leaf_index() < 0) { continue; }
+    if (this->larger_leaf_splits_ == nullptr || this->larger_leaf_splits_->leaf_index() < 0) {
+      continue;
+    }
     if (use_subtract) {
       this->larger_leaf_histogram_array_[feature_index].Subtract(this->smaller_leaf_histogram_array_[feature_index]);
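
[note: the three follow-up commits below trim .ci/check-omp-pragmas.sh down to what POSIX sh guarantees. The new hook invokes the script via `entry: sh`, and `set -E` / `set -o pipefail` are bash/ksh extensions rather than POSIX. A quick sketch of the difference, assuming a strictly POSIX sh (e.g. an older dash) is available:

    sh -c 'set -o pipefail'    # many POSIX shells reject this: "set: Illegal option -o pipefail"
    bash -c 'set -o pipefail'  # accepted; pipefail is a bash/ksh/zsh extension
]
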
From eade0a9c3c9b905fccb5867e32a504ed229fcfe5 Mon Sep 17 00:00:00 2001
From: James Lamb
Date: Wed, 20 Aug 2025 22:18:52 -0500
Subject: [PATCH 2/4] remove bashisms

---
 .ci/check-omp-pragmas.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.ci/check-omp-pragmas.sh b/.ci/check-omp-pragmas.sh
index 42c2e52e0450..e9b08632900c 100755
--- a/.ci/check-omp-pragmas.sh
+++ b/.ci/check-omp-pragmas.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-set -e -E -u -o pipefail
+set -e -u
 
 echo "checking that all OpenMP pragmas specify num_threads()"
 get_omp_pragmas_without_num_threads() {

From bb2dd5b2bd21c2708dd75b87cc011a86c170645d Mon Sep 17 00:00:00 2001
From: James Lamb
Date: Wed, 20 Aug 2025 23:11:53 -0500
Subject: [PATCH 3/4] one more pipefail use

---
 .ci/check-omp-pragmas.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.ci/check-omp-pragmas.sh b/.ci/check-omp-pragmas.sh
index e9b08632900c..8c7fcf084167 100755
--- a/.ci/check-omp-pragmas.sh
+++ b/.ci/check-omp-pragmas.sh
@@ -21,7 +21,7 @@ get_omp_pragmas_without_num_threads() {
 # consider this a failure and stop execution of the script.
 #
 # ref: https://www.gnu.org/software/grep/manual/html_node/Exit-Status.html
-set +e +o pipefail
+set +e
 PROBLEMATIC_LINES=$(
     get_omp_pragmas_without_num_threads
 )

From e7e7c64765f7de9e247726c5012bef7c78f43e35 Mon Sep 17 00:00:00 2001
From: James Lamb
Date: Wed, 20 Aug 2025 23:16:24 -0500
Subject: [PATCH 4/4] another pipefail

---
 .ci/check-omp-pragmas.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.ci/check-omp-pragmas.sh b/.ci/check-omp-pragmas.sh
index 8c7fcf084167..ac83b164fb0b 100755
--- a/.ci/check-omp-pragmas.sh
+++ b/.ci/check-omp-pragmas.sh
@@ -25,7 +25,7 @@ get_omp_pragmas_without_num_threads() {
 set +e
 PROBLEMATIC_LINES=$(
     get_omp_pragmas_without_num_threads
 )
-set -e -o pipefail
+set -e
 if test "${PROBLEMATIC_LINES}" != ""; then
     get_omp_pragmas_without_num_threads
     echo "Found '#pragma omp parallel' not using explicit num_threads() configuration. Fix those."
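
[note: for context on the `set +e` / `set -e` bracket these last two commits keep intact — grep exits non-zero when it matches nothing, and an empty result is the passing case here. A minimal sketch of the failure mode being avoided (hypothetical input, same grep behavior):

    #!/bin/sh
    set -e
    printf '#pragma omp parallel for num_threads(OMP_NUM_THREADS())\n' \
        | grep -v 'num_threads' \
        | grep 'pragma omp parallel'
    # the final grep finds nothing and exits 1; under 'set -e' the script
    # would abort right here, even though zero matches is the desired outcome

Hence the pattern in the script: disable `-e`, capture the wrapper's output, re-enable `-e`, then test the captured string.]
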