diff --git a/CMakeLists.txt b/CMakeLists.txt index f4b94d25..e1a6f54c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,7 +8,7 @@ project(flann) string(TOLOWER ${PROJECT_NAME} PROJECT_NAME_LOWER) include(${PROJECT_SOURCE_DIR}/cmake/flann_utils.cmake) -set(FLANN_VERSION 1.9.1) +set(FLANN_VERSION 1.9.2) DISSECT_VERSION() GET_OS_INFO() @@ -148,6 +148,7 @@ endif(BUILD_CUDA_LIB) find_package(PkgConfig REQUIRED) pkg_check_modules(LZ4 REQUIRED liblz4) +include_directories(${LZ4_INCLUDE_DIRS}) #set the C/C++ include path to the "include" directory include_directories(BEFORE ${PROJECT_SOURCE_DIR}/src/cpp) diff --git a/ChangeLog b/ChangeLog index a5b07777..1e0399bd 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,25 @@ +Version 1.9.2 + * Removed redundant assignment (issue #422 @fluber) + * Removed unnecessary null checks before delete (issue #420 @elfring) + * Reverted PR #424 due to lack of portability + * fscanf fix (PR #467 @rdelfin) + * Out of bounds check (modified PR #455 @legrosbuffle, also included in PR #319 @seth-planet) + * Fixed MacOS build (PR #470 @johnhe4, modified for linux) + * Fixed build system with dummy.c hack (PR #457 @pemmanuelviel) + * Fixed misleading indentation in util/any.h (PR #365 @psteinb, also PR #428, #430, #459) + * Included datasets in repo + * Correct typo in definition (PR #419 @SergioRAgostinho) + * Fix typos (PR #279 @gadomski) + * CMakefile CUDA sources fix (PR #458 @pemmanuelviel) + * Documentation fix (PR #456 @pemmanuelviel) + * Scoping issue fix (PR #405 @greenbrettmichael, also PR #469, issue #386) + * Documentation fixes (PR #460 @pemmanuelviel) + * Changed return value (PR #461 @pemmanuelviel) + * Fixed CUDA crash - guarantee prealloc > 0 (PR #437 @neka-rat) + * Fixed wrong variable use (PR #433 @XinyiYS) + * Fixed RNG initialization (PR #424 @SiddhantRanade) + * Updated link to PDF (PR #474 @kleinma) + Version 1.6.11 * bug fixes diff --git a/README.md b/README.md index e5c02ab9..bac16034 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ Documentation on how to use the library can be found in the doc/manual.pdf file More information and experimental results can be found in the following paper: - * Marius Muja and David G. Lowe, "Fast Approximate Nearest Neighbors with Automatic Algorithm Configuration", in International Conference on Computer Vision Theory and Applications (VISAPP'09), 2009 [(PDF)](http://people.cs.ubc.ca/~mariusm/uploads/FLANN/flann_visapp09.pdf) [(BibTex)](http://people.cs.ubc.ca/~mariusm/index.php/FLANN/BibTex) + * Marius Muja and David G. Lowe, "Fast Approximate Nearest Neighbors with Automatic Algorithm Configuration", in International Conference on Computer Vision Theory and Applications (VISAPP'09), 2009 [(PDF)](https://www.cs.ubc.ca/research/flann/uploads/FLANN/flann_visapp09.pdf) [(BibTex)](http://people.cs.ubc.ca/~mariusm/index.php/FLANN/BibTex) Getting FLANN diff --git a/cmake/flann_utils.cmake b/cmake/flann_utils.cmake index afe4742d..a9f764dc 100644 --- a/cmake/flann_utils.cmake +++ b/cmake/flann_utils.cmake @@ -56,7 +56,7 @@ macro(flann_add_gtest exe) DEPENDS ${exe} WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/test VERBATIM - COMMENT "Runnint gtest test(s) ${exe}") + COMMENT "Running gtest test(s) ${exe}") # add dependency to 'test' target add_dependencies(flann_gtest test_${_testname}) endmacro(flann_add_gtest) @@ -76,7 +76,7 @@ macro(flann_add_cuda_gtest exe) DEPENDS ${exe} WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/test VERBATIM - COMMENT "Runnint gtest test(s) ${exe}") + COMMENT "Running gtest test(s) ${exe}") # add dependency to 'test' target add_dependencies(test test_${_testname}) endmacro(flann_add_cuda_gtest) @@ -100,18 +100,4 @@ macro(flann_add_pyunit file) # add dependency to 'test' target add_dependencies(pyunit_${_testname} flann) add_dependencies(test pyunit_${_testname}) -endmacro(flann_add_pyunit) - - - -macro(flann_download_test_data _name _md5) - string(REPLACE "/" "_" _dataset_name dataset_${_name}) - - add_custom_target(${_dataset_name} - COMMAND ${PYTHON_EXECUTABLE} ${PROJECT_SOURCE_DIR}/bin/download_checkmd5.py http://people.cs.ubc.ca/~mariusm/uploads/FLANN/datasets/${_name} ${TEST_OUTPUT_PATH}/${_name} ${_md5} - VERBATIM) - - # Also make sure that downloads are done before we run any tests - add_dependencies(tests ${_dataset_name}) - -endmacro(flann_download_test_data) +endmacro(flann_add_pyunit) \ No newline at end of file diff --git a/datasets/brief100K.h5 b/datasets/brief100K.h5 new file mode 100644 index 00000000..0c502b8d Binary files /dev/null and b/datasets/brief100K.h5 differ diff --git a/datasets/cloud.h5 b/datasets/cloud.h5 new file mode 100644 index 00000000..075f8cd5 Binary files /dev/null and b/datasets/cloud.h5 differ diff --git a/datasets/sift100K.h5 b/datasets/sift100K.h5 new file mode 100644 index 00000000..3add3730 Binary files /dev/null and b/datasets/sift100K.h5 differ diff --git a/datasets/sift100K_byte.h5 b/datasets/sift100K_byte.h5 new file mode 100644 index 00000000..0d51aee5 Binary files /dev/null and b/datasets/sift100K_byte.h5 differ diff --git a/datasets/sift10K.h5 b/datasets/sift10K.h5 new file mode 100644 index 00000000..fa5808f3 Binary files /dev/null and b/datasets/sift10K.h5 differ diff --git a/datasets/sift10K_byte.h5 b/datasets/sift10K_byte.h5 new file mode 100644 index 00000000..c79a26be Binary files /dev/null and b/datasets/sift10K_byte.h5 differ diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 3f4655a2..a2907739 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -1,9 +1,8 @@ add_custom_target(examples ALL) - if (BUILD_C_BINDINGS) add_executable(flann_example_c flann_example.c) - target_link_libraries(flann_example_c -Wl,--push-state,--no-as-needed ${LZ4_LIBRARIES} -Wl,--pop-state) + target_link_libraries(flann_example_c ${LZ4_LINK_LIBRARIES}) target_link_libraries(flann_example_c flann) set_target_properties(flann_example_c PROPERTIES COMPILE_FLAGS -std=c99) @@ -15,7 +14,7 @@ if (HDF5_FOUND) include_directories(${HDF5_INCLUDE_DIR}) add_executable(flann_example_cpp flann_example.cpp) - target_link_libraries(flann_example_cpp -Wl,--push-state,--no-as-needed ${LZ4_LIBRARIES} -Wl,--pop-state) + target_link_libraries(flann_example_cpp ${LZ4_LINK_LIBRARIES}) target_link_libraries(flann_example_cpp ${HDF5_LIBRARIES} flann_cpp) if (HDF5_IS_PARALLEL) target_link_libraries(flann_example_cpp ${MPI_LIBRARIES}) @@ -27,7 +26,7 @@ if (HDF5_FOUND) if (USE_MPI AND HDF5_IS_PARALLEL) add_executable(flann_example_mpi flann_example_mpi.cpp) - target_link_libraries(flann_example_mpi -Wl,--push-state,--no-as-needed ${LZ4_LIBRARIES} -Wl,--pop-state) + target_link_libraries(flann_example_mpi ${LZ4_LINK_LIBRARIES}) target_link_libraries(flann_example_mpi flann_cpp ${HDF5_LIBRARIES} ${MPI_LIBRARIES} ${Boost_LIBRARIES}) add_dependencies(examples flann_example_mpi) diff --git a/examples/flann_example.c b/examples/flann_example.c index e588359d..5d8d9700 100644 --- a/examples/flann_example.c +++ b/examples/flann_example.c @@ -28,8 +28,10 @@ float* read_points(const char* filename, int rows, int cols) for (i=0;i* struct AutotunedIndexParams : public IndexParams { - AutotunedIndexParams(float target_precision = 0.8, float build_weight = 0.01, float memory_weight = 0, float sample_fraction = 0.1) + AutotunedIndexParams(float target_precision = 0.8f, float build_weight = 0.01f, float memory_weight = 0.f, float sample_fraction = 0.1f) { (*this)["algorithm"] = FLANN_INDEX_AUTOTUNED; // precision desired (used for autotuning, -1 otherwise) @@ -161,7 +161,7 @@ class AutotunedIndex : public NNIndex } - void addPoints(const Matrix& points, float rebuild_threshold = 2) + void addPoints(const Matrix& points, float rebuild_threshold = 2.f) { if (bestIndex_) { bestIndex_->addPoints(points, rebuild_threshold); diff --git a/src/cpp/flann/algorithms/composite_index.h b/src/cpp/flann/algorithms/composite_index.h index 44ef7997..59f6051d 100644 --- a/src/cpp/flann/algorithms/composite_index.h +++ b/src/cpp/flann/algorithms/composite_index.h @@ -45,7 +45,7 @@ namespace flann struct CompositeIndexParams : public IndexParams { CompositeIndexParams(int trees = 4, int branching = 32, int iterations = 11, - flann_centers_init_t centers_init = FLANN_CENTERS_RANDOM, float cb_index = 0.2 ) + flann_centers_init_t centers_init = FLANN_CENTERS_RANDOM, float cb_index = 0.2f ) { (*this)["algorithm"] = FLANN_INDEX_KMEANS; // number of randomized trees to use (for kdtree) @@ -166,7 +166,7 @@ class CompositeIndex : public NNIndex kdtree_index_->buildIndex(); } - void addPoints(const Matrix& points, float rebuild_threshold = 2) + void addPoints(const Matrix& points, float rebuild_threshold = 2.f) { kmeans_index_->addPoints(points, rebuild_threshold); kdtree_index_->addPoints(points, rebuild_threshold); diff --git a/src/cpp/flann/algorithms/dist.h b/src/cpp/flann/algorithms/dist.h index 4e6eb73f..ff0093a5 100644 --- a/src/cpp/flann/algorithms/dist.h +++ b/src/cpp/flann/algorithms/dist.h @@ -477,6 +477,9 @@ struct HammingPopcnt ResultType operator()(Iterator1 a, Iterator2 b, size_t size, ResultType /*worst_dist*/ = -1) const { ResultType result = 0; + + //for portability just use unsigned long -- and use the __builtin_popcountll (see docs for __builtin_popcountll) + typedef unsigned long long pop_t; #if __GNUC__ #if ANDROID && HAVE_NEON static uint64_t features = android_getCpuFeatures(); @@ -499,8 +502,6 @@ struct HammingPopcnt } else #endif - //for portability just use unsigned long -- and use the __builtin_popcountll (see docs for __builtin_popcountll) - typedef unsigned long long pop_t; const size_t modulo = size % sizeof(pop_t); const pop_t* a2 = reinterpret_cast (a); const pop_t* b2 = reinterpret_cast (b); diff --git a/src/cpp/flann/algorithms/hierarchical_clustering_index.h b/src/cpp/flann/algorithms/hierarchical_clustering_index.h index 91db6368..0f257e1c 100644 --- a/src/cpp/flann/algorithms/hierarchical_clustering_index.h +++ b/src/cpp/flann/algorithms/hierarchical_clustering_index.h @@ -364,11 +364,14 @@ class HierarchicalClusteringIndex : public NNIndex * Node points (only for terminal nodes) */ std::vector points; - - Node(){ - pivot = NULL; - pivot_index = SIZE_MAX; - } + /** + * constructor + */ + Node() + { + pivot = NULL; + pivot_index = SIZE_MAX; + } /** * destructor * calling Node destructor explicitly @@ -378,7 +381,7 @@ class HierarchicalClusteringIndex : public NNIndex for(size_t i=0; i~Node(); pivot = NULL; - pivot_index = -1; + pivot_index = SIZE_MAX; } }; @@ -444,7 +447,9 @@ class HierarchicalClusteringIndex : public NNIndex { dst = new(pool_) Node(); dst->pivot_index = src->pivot_index; - dst->pivot = points_[dst->pivot_index]; + + if(dst->pivot_index != SIZE_MAX) + dst->pivot = points_[dst->pivot_index]; if (src->childs.size()==0) { dst->points = src->points; diff --git a/src/cpp/flann/algorithms/kdtree_cuda_3d_index.cu b/src/cpp/flann/algorithms/kdtree_cuda_3d_index.cu index 8465322e..48702533 100644 --- a/src/cpp/flann/algorithms/kdtree_cuda_3d_index.cu +++ b/src/cpp/flann/algorithms/kdtree_cuda_3d_index.cu @@ -42,6 +42,8 @@ #include #include #include +#include +#include #include #include namespace flann @@ -161,14 +163,27 @@ void nearestKernel(const cuda::kd_tree_builder_detail::SplitInfo* splits, template struct KDTreeCuda3dIndex::GpuHelper { - thrust::device_vector< cuda::kd_tree_builder_detail::SplitInfo >* gpu_splits_; - thrust::device_vector< int >* gpu_parent_; - thrust::device_vector< int >* gpu_child1_; - thrust::device_vector< float4 >* gpu_aabb_min_; - thrust::device_vector< float4 >* gpu_aabb_max_; - thrust::device_vector* gpu_points_; - thrust::device_vector* gpu_vind_; - GpuHelper() : gpu_splits_(0), gpu_parent_(0), gpu_child1_(0), gpu_aabb_min_(0), gpu_aabb_max_(0), gpu_points_(0), gpu_vind_(0){ + cudaStream_t gpu_stream; + bool use_ext_stream; + flann::cuda::device_vector_noinit< cuda::kd_tree_builder_detail::SplitInfo >* gpu_splits_; + flann::cuda::device_vector_noinit< int >* gpu_parent_; + flann::cuda::device_vector_noinit< int >* gpu_child1_; + flann::cuda::device_vector_noinit< float4 >* gpu_aabb_min_; + flann::cuda::device_vector_noinit< float4 >* gpu_aabb_max_; + flann::cuda::device_vector_noinit* gpu_points_; + flann::cuda::device_vector_noinit* gpu_vind_; + GpuHelper(cudaStream_t s = (cudaStream_t)0) : gpu_splits_(0), gpu_parent_(0), gpu_child1_(0), gpu_aabb_min_(0), gpu_aabb_max_(0), gpu_points_(0), gpu_vind_(0) + { + if (s == (cudaStream_t)0) + { + cudaStreamCreate(&gpu_stream); + use_ext_stream = false; + } + else + { + gpu_stream = s; + use_ext_stream = true; + } } ~GpuHelper() { @@ -187,6 +202,9 @@ struct KDTreeCuda3dIndex::GpuHelper delete gpu_points_; gpu_points_=0; + + if (use_ext_stream == false) + cudaStreamDestroy(gpu_stream); } }; @@ -302,15 +320,15 @@ void KDTreeCuda3dIndex::knnSearchGpu(const Matrix& querie // std::cout<<" knn:"< queriesDev(istride* queries.rows,0); - thrust::copy( queries.ptr(), queries.ptr()+istride*queries.rows, queriesDev.begin() ); - thrust::device_vector distsDev(queries.rows* ostride); - thrust::device_vector indicesDev(queries.rows* ostride); + flann::cuda::device_vector_noinit queriesDev(thrust::cuda::par.on(gpu_helper_->gpu_stream), istride* queries.rows,0); + thrust::detail::two_system_copy(thrust::host, thrust::cuda::par.on(gpu_helper_->gpu_stream), queries.ptr(), queries.ptr()+istride*queries.rows, queriesDev.begin() ); + flann::cuda::device_vector_noinit distsDev(queries.rows* ostride); + flann::cuda::device_vector_noinit indicesDev(queries.rows* ostride); if( knn==1 ) { - KdTreeCudaPrivate::nearestKernel<<>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), + KdTreeCudaPrivate::nearestKernel<<gpu_stream>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])), @@ -322,7 +340,7 @@ void KDTreeCuda3dIndex::knnSearchGpu(const Matrix& querie thrust::raw_pointer_cast(&indicesDev[0]), thrust::raw_pointer_cast(&distsDev[0]), queries.rows, flann::cuda::SingleResultSet(epsError),distance); - // KdTreeCudaPrivate::nearestKernel<<>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_nodes_)[0])), + // KdTreeCudaPrivate::nearestKernel<<gpu_stream>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_nodes_)[0])), // thrust::raw_pointer_cast( &((*gpu_helper_->gpu_points_)[0]) ), // thrust::raw_pointer_cast(&queriesDev[0]), // queries.stride, @@ -333,7 +351,7 @@ void KDTreeCuda3dIndex::knnSearchGpu(const Matrix& querie } else { if( use_heap ) { - KdTreeCudaPrivate::nearestKernel<<>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), + KdTreeCudaPrivate::nearestKernel<<gpu_stream>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])), @@ -348,7 +366,7 @@ void KDTreeCuda3dIndex::knnSearchGpu(const Matrix& querie , distance); } else { - KdTreeCudaPrivate::nearestKernel<<>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), + KdTreeCudaPrivate::nearestKernel<<gpu_stream>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])), @@ -364,9 +382,9 @@ void KDTreeCuda3dIndex::knnSearchGpu(const Matrix& querie ); } } - thrust::copy( distsDev.begin(), distsDev.end(), dists.ptr() ); - thrust::transform(indicesDev.begin(), indicesDev.end(), indicesDev.begin(), map_indices(thrust::raw_pointer_cast( &((*gpu_helper_->gpu_vind_))[0]) )); - thrust::copy( indicesDev.begin(), indicesDev.end(), indices.ptr() ); + thrust::detail::two_system_copy(thrust::cuda::par.on(gpu_helper_->gpu_stream), thrust::host, distsDev.begin(), distsDev.end(), dists.ptr() ); + thrust::transform(thrust::cuda::par.on(gpu_helper_->gpu_stream), indicesDev.begin(), indicesDev.end(), indicesDev.begin(), map_indices(thrust::raw_pointer_cast( &((*gpu_helper_->gpu_vind_))[0]) )); + thrust::detail::two_system_copy(thrust::cuda::par.on(gpu_helper_->gpu_stream), thrust::host, indicesDev.begin(), indicesDev.end(), indices.ptr() ); } else { thrust::device_ptr qd = thrust::device_pointer_cast(queries.ptr()); @@ -376,7 +394,7 @@ void KDTreeCuda3dIndex::knnSearchGpu(const Matrix& querie if( knn==1 ) { - KdTreeCudaPrivate::nearestKernel<<>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), + KdTreeCudaPrivate::nearestKernel<<gpu_stream>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])), @@ -388,7 +406,7 @@ void KDTreeCuda3dIndex::knnSearchGpu(const Matrix& querie id.get(), dd.get(), queries.rows, flann::cuda::SingleResultSet(epsError),distance); - // KdTreeCudaPrivate::nearestKernel<<>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_nodes_)[0])), + // KdTreeCudaPrivate::nearestKernel<<gpu_stream>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_nodes_)[0])), // thrust::raw_pointer_cast( &((*gpu_helper_->gpu_points_)[0]) ), // thrust::raw_pointer_cast(&queriesDev[0]), // queries.stride, @@ -399,7 +417,7 @@ void KDTreeCuda3dIndex::knnSearchGpu(const Matrix& querie } else { if( use_heap ) { - KdTreeCudaPrivate::nearestKernel<<>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), + KdTreeCudaPrivate::nearestKernel<<gpu_stream>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])), @@ -414,7 +432,7 @@ void KDTreeCuda3dIndex::knnSearchGpu(const Matrix& querie , distance); } else { - KdTreeCudaPrivate::nearestKernel<<>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), + KdTreeCudaPrivate::nearestKernel<<gpu_stream>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])), @@ -430,7 +448,7 @@ void KDTreeCuda3dIndex::knnSearchGpu(const Matrix& querie ); } } - thrust::transform(id, id+knn*queries.rows, id, map_indices(thrust::raw_pointer_cast( &((*gpu_helper_->gpu_vind_))[0]) )); + thrust::transform(thrust::cuda::par.on(gpu_helper_->gpu_stream), id, id+ostride*queries.rows, id, map_indices(thrust::raw_pointer_cast( &((*gpu_helper_->gpu_vind_))[0]) )); } } @@ -450,9 +468,9 @@ int KDTreeCuda3dIndex::radiusSearchGpu(const Matrix& que int istride=queries.stride/sizeof(ElementType); - thrust::device_vector queriesDev(istride* queries.rows,0); - thrust::copy( queries.ptr(), queries.ptr()+istride*queries.rows, queriesDev.begin() ); - thrust::device_vector countsDev(queries.rows); + flann::cuda::device_vector_noinit queriesDev(thrust::cuda::par.on(gpu_helper_->gpu_stream), istride* queries.rows,0); + thrust::detail::two_system_copy(thrust::host, thrust::cuda::par.on(gpu_helper_->gpu_stream), queries.ptr(), queries.ptr()+istride*queries.rows, queriesDev.begin() ); + flann::cuda::device_vector_noinit countsDev(queries.rows); typename GpuDistance::type distance; @@ -460,7 +478,7 @@ int KDTreeCuda3dIndex::radiusSearchGpu(const Matrix& que int blocksPerGrid=(queries.rows+threadsPerBlock-1)/threadsPerBlock; - KdTreeCudaPrivate::nearestKernel<<>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), + KdTreeCudaPrivate::nearestKernel<<gpu_stream>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])), @@ -475,7 +493,8 @@ int KDTreeCuda3dIndex::radiusSearchGpu(const Matrix& que distance ); - thrust::host_vector counts_host=countsDev; + thrust::host_vector counts_host(countsDev.size()); + thrust::detail::two_system_copy(thrust::cuda::par.on(gpu_helper_->gpu_stream), thrust::host, countsDev.begin(), countsDev.end(), counts_host.begin()); if( max_neighbors!=0 ) { // we'll need this later, but the exclusive_scan will change the array for( size_t i=0; i::radiusSearchGpu(const Matrix& que } int neighbors_last_elem = countsDev.back(); - thrust::exclusive_scan( countsDev.begin(), countsDev.end(), countsDev.begin() ); + thrust::exclusive_scan(thrust::cuda::par.on(gpu_helper_->gpu_stream), countsDev.begin(), countsDev.end(), countsDev.begin() ); size_t total_neighbors=neighbors_last_elem+countsDev.back(); if( max_neighbors==0 ) return total_neighbors; - thrust::device_vector indicesDev(total_neighbors,-1); - thrust::device_vector distsDev(total_neighbors,std::numeric_limits::infinity()); + flann::cuda::device_vector_noinit indicesDev(thrust::cuda::par.on(gpu_helper_->gpu_stream), total_neighbors,-1); + flann::cuda::device_vector_noinit distsDev(thrust::cuda::par.on(gpu_helper_->gpu_stream), total_neighbors,std::numeric_limits::infinity()); if( max_neighbors<0 ) { - KdTreeCudaPrivate::nearestKernel<<>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), + KdTreeCudaPrivate::nearestKernel<<gpu_stream>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])), @@ -517,7 +536,7 @@ int KDTreeCuda3dIndex::radiusSearchGpu(const Matrix& que } else { if( use_heap ) { - KdTreeCudaPrivate::nearestKernel<<>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), + KdTreeCudaPrivate::nearestKernel<<gpu_stream>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])), @@ -531,7 +550,7 @@ int KDTreeCuda3dIndex::radiusSearchGpu(const Matrix& que queries.rows, flann::cuda::RadiusKnnResultSet(radius,max_neighbors, thrust::raw_pointer_cast(&countsDev[0]),sorted), distance); } else { - KdTreeCudaPrivate::nearestKernel<<>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), + KdTreeCudaPrivate::nearestKernel<<gpu_stream>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])), @@ -545,9 +564,11 @@ int KDTreeCuda3dIndex::radiusSearchGpu(const Matrix& que queries.rows, flann::cuda::RadiusKnnResultSet(radius,max_neighbors, thrust::raw_pointer_cast(&countsDev[0]),sorted), distance); } } - thrust::transform(indicesDev.begin(), indicesDev.end(), indicesDev.begin(), map_indices(thrust::raw_pointer_cast( &((*gpu_helper_->gpu_vind_))[0]) )); - thrust::host_vector indices_temp = indicesDev; - thrust::host_vector dists_temp = distsDev; + thrust::transform(thrust::cuda::par.on(gpu_helper_->gpu_stream), indicesDev.begin(), indicesDev.end(), indicesDev.begin(), map_indices(thrust::raw_pointer_cast( &((*gpu_helper_->gpu_vind_))[0]) )); + thrust::host_vector indices_temp(indicesDev.size()); + thrust::detail::two_system_copy(thrust::cuda::par.on(gpu_helper_->gpu_stream), thrust::host, indicesDev.begin(), indicesDev.end(), indices_temp.begin()); + thrust::host_vector dists_temp(distsDev.size()); + thrust::detail::two_system_copy(thrust::cuda::par.on(gpu_helper_->gpu_stream), thrust::host, distsDev.begin(), distsDev.end(), dists_temp.begin()); int buffer_index=0; for( size_t i=0; i::radiusSearchGpu(const Matrix& qu if( max_neighbors<0 ) max_neighbors=indices.cols; if( !matrices_on_gpu ) { - thrust::device_vector queriesDev(istride* queries.rows,0); - thrust::copy( queries.ptr(), queries.ptr()+istride*queries.rows, queriesDev.begin() ); + flann::cuda::device_vector_noinit queriesDev(thrust::cuda::par.on(gpu_helper_->gpu_stream), istride* queries.rows,0); + thrust::detail::two_system_copy(thrust::host, thrust::cuda::par.on(gpu_helper_->gpu_stream), queries.ptr(), queries.ptr()+istride*queries.rows, queriesDev.begin() ); typename GpuDistance::type distance; int threadsPerBlock = 128; int blocksPerGrid=(queries.rows+threadsPerBlock-1)/threadsPerBlock; if( max_neighbors== 0 ) { - thrust::device_vector indicesDev(queries.rows* ostride); - KdTreeCudaPrivate::nearestKernel<<>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), + flann::cuda::device_vector_noinit indicesDev(queries.rows* ostride); + KdTreeCudaPrivate::nearestKernel<<gpu_stream>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])), @@ -612,16 +633,16 @@ int KDTreeCuda3dIndex< Distance >::radiusSearchGpu(const Matrix& qu queries.rows, flann::cuda::CountingRadiusResultSet(radius,-1), distance ); - thrust::copy( indicesDev.begin(), indicesDev.end(), indices.ptr() ); - return thrust::reduce(indicesDev.begin(), indicesDev.end() ); + thrust::detail::two_system_copy(thrust::cuda::par.on(gpu_helper_->gpu_stream), thrust::host, indicesDev.begin(), indicesDev.end(), indices.ptr() ); + return thrust::reduce(thrust::cuda::par.on(gpu_helper_->gpu_stream), indicesDev.begin(), indicesDev.end() ); } - thrust::device_vector distsDev(queries.rows* max_neighbors); - thrust::device_vector indicesDev(queries.rows* max_neighbors); + flann::cuda::device_vector_noinit distsDev(queries.rows* max_neighbors); + flann::cuda::device_vector_noinit indicesDev(queries.rows* max_neighbors); if( use_heap ) { - KdTreeCudaPrivate::nearestKernel<<>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), + KdTreeCudaPrivate::nearestKernel<<gpu_stream>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])), @@ -635,7 +656,7 @@ int KDTreeCuda3dIndex< Distance >::radiusSearchGpu(const Matrix& qu queries.rows, flann::cuda::KnnRadiusResultSet(max_neighbors,sorted,epsError, radius), distance); } else { - KdTreeCudaPrivate::nearestKernel<<>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), + KdTreeCudaPrivate::nearestKernel<<gpu_stream>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])), @@ -649,11 +670,11 @@ int KDTreeCuda3dIndex< Distance >::radiusSearchGpu(const Matrix& qu queries.rows, flann::cuda::KnnRadiusResultSet(max_neighbors,sorted,epsError, radius), distance); } - thrust::copy( distsDev.begin(), distsDev.end(), dists.ptr() ); - thrust::transform(indicesDev.begin(), indicesDev.end(), indicesDev.begin(), map_indices(thrust::raw_pointer_cast( &((*gpu_helper_->gpu_vind_))[0]) )); - thrust::copy( indicesDev.begin(), indicesDev.end(), indices.ptr() ); + thrust::detail::two_system_copy(thrust::cuda::par.on(gpu_helper_->gpu_stream), thrust::host, distsDev.begin(), distsDev.end(), dists.ptr() ); + thrust::transform(thrust::cuda::par.on(gpu_helper_->gpu_stream), indicesDev.begin(), indicesDev.end(), indicesDev.begin(), map_indices(thrust::raw_pointer_cast( &((*gpu_helper_->gpu_vind_))[0]) )); + thrust::detail::two_system_copy(thrust::cuda::par.on(gpu_helper_->gpu_stream), thrust::host, indicesDev.begin(), indicesDev.end(), indices.ptr() ); - return thrust::count_if(indicesDev.begin(), indicesDev.end(), isNotMinusOne() ); + return thrust::count_if(thrust::cuda::par.on(gpu_helper_->gpu_stream), indicesDev.begin(), indicesDev.end(), isNotMinusOne() ); } else { @@ -665,8 +686,8 @@ int KDTreeCuda3dIndex< Distance >::radiusSearchGpu(const Matrix& qu int blocksPerGrid=(queries.rows+threadsPerBlock-1)/threadsPerBlock; if( max_neighbors== 0 ) { - thrust::device_vector indicesDev(queries.rows* indices.stride); - KdTreeCudaPrivate::nearestKernel<<>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), + flann::cuda::device_vector_noinit indicesDev(queries.rows* indices.stride); + KdTreeCudaPrivate::nearestKernel<<gpu_stream>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])), @@ -680,12 +701,12 @@ int KDTreeCuda3dIndex< Distance >::radiusSearchGpu(const Matrix& qu queries.rows, flann::cuda::CountingRadiusResultSet(radius,-1), distance ); - thrust::copy( indicesDev.begin(), indicesDev.end(), indices.ptr() ); - return thrust::reduce(indicesDev.begin(), indicesDev.end() ); + thrust::copy(thrust::cuda::par.on(gpu_helper_->gpu_stream), indicesDev.begin(), indicesDev.end(), indices.ptr() ); + return thrust::reduce(thrust::cuda::par.on(gpu_helper_->gpu_stream), indicesDev.begin(), indicesDev.end() ); } if( use_heap ) { - KdTreeCudaPrivate::nearestKernel<<>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), + KdTreeCudaPrivate::nearestKernel<<gpu_stream>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])), @@ -699,7 +720,7 @@ int KDTreeCuda3dIndex< Distance >::radiusSearchGpu(const Matrix& qu queries.rows, flann::cuda::KnnRadiusResultSet(max_neighbors,sorted,epsError, radius), distance); } else { - KdTreeCudaPrivate::nearestKernel<<>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), + KdTreeCudaPrivate::nearestKernel<<gpu_stream>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])), thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])), @@ -713,9 +734,9 @@ int KDTreeCuda3dIndex< Distance >::radiusSearchGpu(const Matrix& qu queries.rows, flann::cuda::KnnRadiusResultSet(max_neighbors,sorted,epsError, radius), distance); } - thrust::transform(id, id+max_neighbors*queries.rows, id, map_indices(thrust::raw_pointer_cast( &((*gpu_helper_->gpu_vind_))[0]) )); + thrust::transform(thrust::cuda::par.on(gpu_helper_->gpu_stream), id, id+max_neighbors*queries.rows, id, map_indices(thrust::raw_pointer_cast( &((*gpu_helper_->gpu_vind_))[0]) )); - return thrust::count_if(id, id+max_neighbors*queries.rows, isNotMinusOne() ); + return thrust::count_if(thrust::cuda::par.on(gpu_helper_->gpu_stream), id, id+max_neighbors*queries.rows, isNotMinusOne() ); } } @@ -727,12 +748,13 @@ void KDTreeCuda3dIndex::uploadTreeToGpu() // (I would make this a (boost) static assertion, but so far flann seems to avoid boost // assert( sizeof( KdTreeCudaPrivate::GpuNode)==sizeof( Node ) ); delete gpu_helper_; - gpu_helper_ = new GpuHelper; - gpu_helper_->gpu_points_=new thrust::device_vector(size_); - thrust::device_vector tmp(size_); + cudaStream_t s = get_param(index_params_, "gpu_stream", (cudaStream_t)0); + gpu_helper_ = new GpuHelper(s); + gpu_helper_->gpu_points_=new flann::cuda::device_vector_noinit(size_); + flann::cuda::device_vector_noinit tmp(size_); if( get_param(index_params_,"input_is_gpu_float4",false) ) { assert( dataset_.cols == 3 && dataset_.stride==4*sizeof(float)); - thrust::copy( thrust::device_pointer_cast((float4*)dataset_.ptr()),thrust::device_pointer_cast((float4*)(dataset_.ptr()))+size_,tmp.begin()); + thrust::copy(thrust::cuda::par.on(gpu_helper_->gpu_stream), thrust::device_pointer_cast((float4*)dataset_.ptr()),thrust::device_pointer_cast((float4*)(dataset_.ptr()))+size_,tmp.begin()); } else { @@ -750,24 +772,24 @@ void KDTreeCuda3dIndex::uploadTreeToGpu() data_[i][j] = 0; } } - thrust::copy((float4*)data_.ptr(),(float4*)(data_.ptr())+size_,tmp.begin()); + thrust::detail::two_system_copy(thrust::host, thrust::cuda::par.on(gpu_helper_->gpu_stream), (float4*)data_.ptr(),(float4*)(data_.ptr())+size_,tmp.begin()); } - CudaKdTreeBuilder builder( tmp, leaf_max_size_ ); + CudaKdTreeBuilder builder( tmp, leaf_max_size_, gpu_helper_->gpu_stream ); builder.buildTree(); - gpu_helper_->gpu_splits_ = builder.splits_; - gpu_helper_->gpu_aabb_min_ = builder.aabb_min_; + gpu_helper_->gpu_splits_ = builder.splits_; + gpu_helper_->gpu_aabb_min_ = builder.aabb_min_; gpu_helper_->gpu_aabb_max_ = builder.aabb_max_; gpu_helper_->gpu_child1_ = builder.child1_; gpu_helper_->gpu_parent_=builder.parent_; gpu_helper_->gpu_vind_=builder.index_x_; - thrust::gather( builder.index_x_->begin(), builder.index_x_->end(), tmp.begin(), gpu_helper_->gpu_points_->begin()); + thrust::gather(thrust::cuda::par.on(gpu_helper_->gpu_stream), builder.index_x_->begin(), builder.index_x_->end(), tmp.begin(), gpu_helper_->gpu_points_->begin()); - // gpu_helper_->gpu_nodes_=new thrust::device_vector(node_count_); + // gpu_helper_->gpu_nodes_=new flann::cuda::device_vector_noinit(node_count_); - // gpu_helper_->gpu_vind_=new thrust::device_vector(size_); + // gpu_helper_->gpu_vind_=new flann::cuda::device_vector_noinit(size_); // thrust::copy( (KdTreeCudaPrivate::GpuNode*)&(tree_[0]), ((KdTreeCudaPrivate::GpuNode*)&(tree_[0]))+tree_.size(), gpu_helper_->gpu_nodes_->begin()); // thrust::copy(vind_.begin(),vind_.end(),gpu_helper_->gpu_vind_->begin()); diff --git a/src/cpp/flann/algorithms/kdtree_cuda_3d_index.h b/src/cpp/flann/algorithms/kdtree_cuda_3d_index.h index 897a3166..cdadbb70 100644 --- a/src/cpp/flann/algorithms/kdtree_cuda_3d_index.h +++ b/src/cpp/flann/algorithms/kdtree_cuda_3d_index.h @@ -87,7 +87,7 @@ class KDTreeCuda3dIndex : public NNIndex * params = parameters passed to the kdtree algorithm */ KDTreeCuda3dIndex(const Matrix& inputData, const IndexParams& params = KDTreeCuda3dIndexParams(), - Distance d = Distance() ) : BaseClass(params,d), dataset_(inputData), leaf_count_(0), visited_leafs(0), node_count_(0), current_node_count_(0) + Distance d = Distance() ) : BaseClass(params,d), visited_leafs(0), dataset_(inputData), leaf_count_(0), node_count_(0), current_node_count_(0) { size_ = dataset_.rows; dim_ = dataset_.cols; diff --git a/src/cpp/flann/algorithms/kdtree_cuda_builder.h b/src/cpp/flann/algorithms/kdtree_cuda_builder.h index 8bb4e838..d8294ca2 100644 --- a/src/cpp/flann/algorithms/kdtree_cuda_builder.h +++ b/src/cpp/flann/algorithms/kdtree_cuda_builder.h @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -43,7 +44,7 @@ namespace flann { // template< typename T > -// void print_vector( const thrust::device_vector& v ) +// void print_vector( const flann::cuda::device_vector_noinit& v ) // { // for( int i=0; i< v.size(); i++ ) // { @@ -52,7 +53,7 @@ namespace flann // } // // template< typename T1, typename T2 > -// void print_vector( const thrust::device_vector& v1, const thrust::device_vector& v2 ) +// void print_vector( const flann::cuda::device_vector_noinit& v1, const flann::cuda::device_vector_noinit& v2 ) // { // for( int i=0; i< v1.size(); i++ ) // { @@ -61,7 +62,7 @@ namespace flann // } // // template< typename T1, typename T2, typename T3 > -// void print_vector( const thrust::device_vector& v1, const thrust::device_vector& v2, const thrust::device_vector& v3 ) +// void print_vector( const flann::cuda::device_vector_noinit& v1, const flann::cuda::device_vector_noinit& v2, const flann::cuda::device_vector_noinit& v3 ) // { // for( int i=0; i< v1.size(); i++ ) // { @@ -70,7 +71,7 @@ namespace flann // } // // template< typename T > -// void print_vector_by_index( const thrust::device_vector& v,const thrust::device_vector& ind ) +// void print_vector_by_index( const flann::cuda::device_vector_noinit& v,const flann::cuda::device_vector_noinit& ind ) // { // for( int i=0; i< v.size(); i++ ) // { @@ -95,6 +96,108 @@ namespace flann // } namespace cuda { + // flann::cuda::device_vector_noinit is used to take place of thrust::device_vector + // as thrust::device_vector always uses default stream, and always contains a fill + template + class device_vector_noinit + { + private: + thrust::device_ptr m_ptr; + size_t m_size; + + public: + device_vector_noinit() + { + m_size = 0; + } + + ~device_vector_noinit() + { + if (m_size) + thrust::device_free(m_ptr); + m_size = 0; + } + + device_vector_noinit(size_t s) + { + m_size = s; + if (s) + m_ptr = thrust::device_malloc(s); + } + + template + device_vector_noinit(const thrust::detail::execution_policy_base &exec, size_t s, T t) + { + m_size = s; + if (s) + { + m_ptr = thrust::device_malloc(s); + thrust::fill(exec, m_ptr, m_ptr + m_size, t); + } + } + + size_t size() const + { + return m_size; + } + + void resize(size_t s) + { + if (m_size) + thrust::device_free(m_ptr); + m_size = s; + if (s) + m_ptr = thrust::device_malloc(s); + } + + template + void append(const thrust::detail::execution_policy_base &exec, size_t n, T t) + { + if (n == 0) + return; + + if (m_size) + { + thrust::device_ptr new_ptr = thrust::device_malloc(m_size + n); + + thrust::copy(exec, m_ptr, m_ptr + m_size, new_ptr); + thrust::fill(exec, new_ptr + m_size, new_ptr + m_size + n, t); + + thrust::device_free(m_ptr); + m_ptr = new_ptr; + + m_size = m_size + n; + } + else + { + resize(n); + thrust::fill(exec, m_ptr, m_ptr + n, t); + m_size = n; + } + } + + thrust::device_ptr begin() const + { + return m_ptr; + } + + thrust::device_ptr end() const + { + return m_ptr + m_size; + } + + auto back() const + { + return m_ptr[m_size - 1]; + } + + auto operator [] (const size_t i) const + { + return m_ptr[i]; + } + }; + + namespace kd_tree_builder_detail { //! normal node: contains the split dimension and value @@ -401,48 +504,52 @@ std::ostream& operator <<(std::ostream& stream, const cuda::kd_tree_builder_deta class CudaKdTreeBuilder { public: - CudaKdTreeBuilder( const thrust::device_vector& points, int max_leaf_size ) : /*out_of_space_(1,0),node_count_(1,1),*/ max_leaf_size_(max_leaf_size) + CudaKdTreeBuilder( const flann::cuda::device_vector_noinit& points, int max_leaf_size, cudaStream_t stream ) : /*out_of_space_(1,0),node_count_(1,1),*/ max_leaf_size_(max_leaf_size) { points_=&points; - int prealloc = points.size()/max_leaf_size_*16; - allocation_info_.resize(3); - allocation_info_[NodeCount]=1; - allocation_info_[NodesAllocated]=prealloc; - allocation_info_[OutOfSpace]=0; + gpu_stream = stream; + int prealloc = max((int)points.size()/max_leaf_size_*16, 1); + thrust::host_vector alloc_info(3); + alloc_info[0] = 1; + alloc_info[1] = prealloc; + alloc_info[2] = 0; + allocation_info_.resize(3); + thrust::detail::two_system_copy(thrust::host, thrust::cuda::par.on(gpu_stream), alloc_info.begin(), alloc_info.end(), allocation_info_.begin()); // std::cout<size()<(prealloc,-1); - parent_=new thrust::device_vector(prealloc,-1); - cuda::kd_tree_builder_detail::SplitInfo s; + child1_ = new flann::cuda::device_vector_noinit(thrust::cuda::par.on(gpu_stream), prealloc, -1); + parent_ = new flann::cuda::device_vector_noinit(thrust::cuda::par.on(gpu_stream), prealloc, -1); + cuda::kd_tree_builder_detail::SplitInfo s; s.left=0; s.right=0; - splits_=new thrust::device_vector(prealloc,s); - s.right=points.size(); - (*splits_)[0]=s; + splits_ = new flann::cuda::device_vector_noinit(thrust::cuda::par.on(gpu_stream), prealloc, s); + s.right=points.size(); + //(*splits_)[0]=s; + thrust::detail::two_system_copy(thrust::host, thrust::cuda::par.on(gpu_stream), &s, &s + 1, splits_->begin()); - aabb_min_=new thrust::device_vector(prealloc); - aabb_max_=new thrust::device_vector(prealloc); + aabb_min_=new flann::cuda::device_vector_noinit(prealloc); + aabb_max_=new flann::cuda::device_vector_noinit(prealloc); - index_x_=new thrust::device_vector(points_->size()); - index_y_=new thrust::device_vector(points_->size()); - index_z_=new thrust::device_vector(points_->size()); + index_x_=new flann::cuda::device_vector_noinit(points_->size()); + index_y_=new flann::cuda::device_vector_noinit(points_->size()); + index_z_=new flann::cuda::device_vector_noinit(points_->size()); - owners_x_=new thrust::device_vector(points_->size(),0); - owners_y_=new thrust::device_vector(points_->size(),0); - owners_z_=new thrust::device_vector(points_->size(),0); + owners_x_=new flann::cuda::device_vector_noinit(thrust::cuda::par.on(gpu_stream), points_->size(),0); + owners_y_=new flann::cuda::device_vector_noinit(thrust::cuda::par.on(gpu_stream), points_->size(),0); + owners_z_=new flann::cuda::device_vector_noinit(thrust::cuda::par.on(gpu_stream), points_->size(),0); - leftright_x_ = new thrust::device_vector(points_->size(),0); - leftright_y_ = new thrust::device_vector(points_->size(),0); - leftright_z_ = new thrust::device_vector(points_->size(),0); + leftright_x_ = new flann::cuda::device_vector_noinit(thrust::cuda::par.on(gpu_stream), points_->size(),0); + leftright_y_ = new flann::cuda::device_vector_noinit(thrust::cuda::par.on(gpu_stream), points_->size(),0); + leftright_z_ = new flann::cuda::device_vector_noinit(thrust::cuda::par.on(gpu_stream), points_->size(),0); - tmp_index_=new thrust::device_vector(points_->size()); - tmp_owners_=new thrust::device_vector(points_->size()); - tmp_misc_=new thrust::device_vector(points_->size()); + tmp_index_=new flann::cuda::device_vector_noinit(points_->size()); + tmp_owners_=new flann::cuda::device_vector_noinit(points_->size()); + tmp_misc_=new flann::cuda::device_vector_noinit(points_->size()); - points_x_=new thrust::device_vector(points_->size()); - points_y_=new thrust::device_vector(points_->size()); - points_z_=new thrust::device_vector(points_->size()); + points_x_=new flann::cuda::device_vector_noinit(points_->size()); + points_y_=new flann::cuda::device_vector_noinit(points_->size()); + points_z_=new flann::cuda::device_vector_noinit(points_->size()); delete_node_info_=false; } @@ -455,7 +562,7 @@ class CudaKdTreeBuilder delete aabb_min_; delete aabb_max_; delete index_x_; - } + } delete index_y_; delete index_z_; @@ -486,28 +593,49 @@ class CudaKdTreeBuilder // std::cout<<"buildTree()"<begin(), points_->end(), thrust::make_zip_iterator(thrust::make_tuple(points_x_->begin(), points_y_->begin(),points_z_->begin()) ), cuda::kd_tree_builder_detail::pointxyz_to_px_py_pz() ); + thrust::transform(thrust::cuda::par.on(gpu_stream), points_->begin(), points_->end(), thrust::make_zip_iterator(thrust::make_tuple(points_x_->begin(), points_y_->begin(),points_z_->begin()) ), cuda::kd_tree_builder_detail::pointxyz_to_px_py_pz() ); thrust::counting_iterator it(0); - thrust::copy( it, it+points_->size(), index_x_->begin() ); + thrust::copy(thrust::cuda::par.on(gpu_stream), it, it+points_->size(), index_x_->begin() ); - thrust::copy( index_x_->begin(), index_x_->end(), index_y_->begin() ); - thrust::copy( index_x_->begin(), index_x_->end(), index_z_->begin() ); + thrust::copy(thrust::cuda::par.on(gpu_stream), index_x_->begin(), index_x_->end(), index_y_->begin() ); + thrust::copy(thrust::cuda::par.on(gpu_stream), index_x_->begin(), index_x_->end(), index_z_->begin() ); - thrust::device_vector tmpv(points_->size()); + flann::cuda::device_vector_noinit tmpv(points_->size()); // create sorted index list -> can be used to compute AABBs in O(1) - thrust::copy(points_x_->begin(), points_x_->end(), tmpv.begin()); - thrust::sort_by_key( tmpv.begin(), tmpv.end(), index_x_->begin() ); - thrust::copy(points_y_->begin(), points_y_->end(), tmpv.begin()); - thrust::sort_by_key( tmpv.begin(), tmpv.end(), index_y_->begin() ); - thrust::copy(points_z_->begin(), points_z_->end(), tmpv.begin()); - thrust::sort_by_key( tmpv.begin(), tmpv.end(), index_z_->begin() ); - - - (*aabb_min_)[0]=make_float4((*points_x_)[(*index_x_)[0]],(*points_y_)[(*index_y_)[0]],(*points_z_)[(*index_z_)[0]],0); - - (*aabb_max_)[0]=make_float4((*points_x_)[(*index_x_)[points_->size()-1]],(*points_y_)[(*index_y_)[points_->size()-1]],(*points_z_)[(*index_z_)[points_->size()-1]],0); + thrust::copy(thrust::cuda::par.on(gpu_stream), points_x_->begin(), points_x_->end(), tmpv.begin()); + thrust::sort_by_key(thrust::cuda::par.on(gpu_stream), tmpv.begin(), tmpv.end(), index_x_->begin() ); + thrust::copy(thrust::cuda::par.on(gpu_stream), points_y_->begin(), points_y_->end(), tmpv.begin()); + thrust::sort_by_key(thrust::cuda::par.on(gpu_stream), tmpv.begin(), tmpv.end(), index_y_->begin() ); + thrust::copy(thrust::cuda::par.on(gpu_stream), points_z_->begin(), points_z_->end(), tmpv.begin()); + thrust::sort_by_key(thrust::cuda::par.on(gpu_stream), tmpv.begin(), tmpv.end(), index_z_->begin() ); + + int idxx, idxy, idxz; + float xx, yy, zz; + float4 xyzw; + + thrust::detail::two_system_copy(thrust::cuda::par.on(gpu_stream), thrust::host, index_x_->begin(), index_x_->begin() + 1, &idxx); + thrust::detail::two_system_copy(thrust::cuda::par.on(gpu_stream), thrust::host, index_y_->begin(), index_y_->begin() + 1, &idxy); + thrust::detail::two_system_copy(thrust::cuda::par.on(gpu_stream), thrust::host, index_z_->begin(), index_z_->begin() + 1, &idxz); + thrust::detail::two_system_copy(thrust::cuda::par.on(gpu_stream), thrust::host, points_x_->begin() + idxx, points_x_->begin() + idxx + 1, &xx); + thrust::detail::two_system_copy(thrust::cuda::par.on(gpu_stream), thrust::host, points_y_->begin() + idxy, points_y_->begin() + idxy + 1, &yy); + thrust::detail::two_system_copy(thrust::cuda::par.on(gpu_stream), thrust::host, points_z_->begin() + idxz, points_z_->begin() + idxz + 1, &zz); + xyzw = make_float4(xx, yy, zz, 0); + thrust::detail::two_system_copy(thrust::host, thrust::cuda::par.on(gpu_stream), &xyzw, &xyzw + 1, aabb_min_->begin()); + + thrust::detail::two_system_copy(thrust::cuda::par.on(gpu_stream), thrust::host, index_x_->end() - 1, index_x_->end(), &idxx); + thrust::detail::two_system_copy(thrust::cuda::par.on(gpu_stream), thrust::host, index_y_->end() - 1, index_y_->end(), &idxy); + thrust::detail::two_system_copy(thrust::cuda::par.on(gpu_stream), thrust::host, index_z_->end() - 1, index_z_->end(), &idxz); + thrust::detail::two_system_copy(thrust::cuda::par.on(gpu_stream), thrust::host, points_x_->begin() + idxx, points_x_->begin() + idxx + 1, &xx); + thrust::detail::two_system_copy(thrust::cuda::par.on(gpu_stream), thrust::host, points_y_->begin() + idxy, points_y_->begin() + idxy + 1, &yy); + thrust::detail::two_system_copy(thrust::cuda::par.on(gpu_stream), thrust::host, points_z_->begin() + idxz, points_z_->begin() + idxz + 1, &zz); + xyzw = make_float4(xx, yy, zz, 0); + thrust::detail::two_system_copy(thrust::host, thrust::cuda::par.on(gpu_stream), &xyzw, &xyzw + 1, aabb_max_->begin()); + + //(*aabb_min_)[0]=make_float4((*points_x_)[(*index_x_)[0]],(*points_y_)[(*index_y_)[0]],(*points_z_)[(*index_z_)[0]],0); + + //(*aabb_max_)[0]=make_float4((*points_x_)[(*index_x_)[points_->size()-1]],(*points_y_)[(*index_y_)[points_->size()-1]],(*points_z_)[(*index_z_)[points_->size()-1]],0); #ifdef PRINT_DEBUG_TIMING cudaDeviceSynchronize(); std::cout<<" initial stuff:"< cit(0); - thrust::for_each( thrust::make_zip_iterator(thrust::make_tuple( parent_->begin(), child1_->begin(), splits_->begin(), aabb_min_->begin(), aabb_max_->begin(), cit )), + thrust::for_each(thrust::cuda::par.on(gpu_stream), thrust::make_zip_iterator(thrust::make_tuple( parent_->begin(), child1_->begin(), splits_->begin(), aabb_min_->begin(), aabb_max_->begin(), cit )), thrust::make_zip_iterator(thrust::make_tuple( parent_->begin()+last_node_count, child1_->begin()+last_node_count,splits_->begin()+last_node_count, aabb_min_->begin()+last_node_count, aabb_max_->begin()+last_node_count,cit+last_node_count )), sn ); // copy allocation info to host - thrust::host_vector alloc_info = allocation_info_; + thrust::host_vector alloc_info(allocation_info_.size()); + thrust::detail::two_system_copy(thrust::cuda::par.on(gpu_stream), thrust::host, allocation_info_.begin(), allocation_info_.end(), alloc_info.begin()); if( last_node_count == alloc_info[NodeCount] ) { // no more nodes were split -> done break; @@ -542,7 +671,8 @@ class CudaKdTreeBuilder resize_node_vectors(alloc_info[NodesAllocated]*2); alloc_info[OutOfSpace]=0; alloc_info[NodesAllocated]*=2; - allocation_info_=alloc_info; + thrust::detail::two_system_copy(thrust::host, thrust::cuda::par.on(gpu_stream), alloc_info.begin(), alloc_info.end(), allocation_info_.begin()); + //allocation_info_=alloc_info; } #ifdef PRINT_DEBUG_TIMING cudaDeviceSynchronize(); @@ -564,7 +694,7 @@ class CudaKdTreeBuilder thrust::raw_pointer_cast(&(*leftright_z_)[0]) ); thrust::counting_iterator ci0(0); - thrust::for_each( thrust::make_zip_iterator( thrust::make_tuple( ci0, index_x_->begin(), index_y_->begin(), index_z_->begin()) ), + thrust::for_each(thrust::cuda::par.on(gpu_stream), thrust::make_zip_iterator( thrust::make_tuple( ci0, index_x_->begin(), index_y_->begin(), index_z_->begin()) ), thrust::make_zip_iterator( thrust::make_tuple( ci0+points_->size(), index_x_->end(), index_y_->end(), index_z_->end()) ),sno ); #ifdef PRINT_DEBUG_TIMING @@ -607,17 +737,17 @@ class CudaKdTreeBuilder //! takes the partitioned nodes, and sets the left-/right info of leaf nodes, as well as the AABBs void - update_leftright_and_aabb( const thrust::device_vector& x, const thrust::device_vector& y,const thrust::device_vector& z, - const thrust::device_vector& ix, const thrust::device_vector& iy,const thrust::device_vector& iz, - const thrust::device_vector& owners, - thrust::device_vector& splits, thrust::device_vector& aabbMin,thrust::device_vector& aabbMax) + update_leftright_and_aabb( const flann::cuda::device_vector_noinit& x, const flann::cuda::device_vector_noinit& y,const flann::cuda::device_vector_noinit& z, + const flann::cuda::device_vector_noinit& ix, const flann::cuda::device_vector_noinit& iy,const flann::cuda::device_vector_noinit& iz, + const flann::cuda::device_vector_noinit& owners, + flann::cuda::device_vector_noinit& splits, flann::cuda::device_vector_noinit& aabbMin,flann::cuda::device_vector_noinit& aabbMax) { - thrust::device_vector* labelsUnique=tmp_owners_; - thrust::device_vector* countsUnique=tmp_index_; + flann::cuda::device_vector_noinit* labelsUnique=tmp_owners_; + flann::cuda::device_vector_noinit* countsUnique=tmp_index_; // assume: points of each node are continuous in the array // find which nodes are here, and where each node's points begin and end - int unique_labels = thrust::unique_by_key_copy( owners.begin(), owners.end(), thrust::counting_iterator(0), labelsUnique->begin(), countsUnique->begin()).first - labelsUnique->begin(); + int unique_labels = thrust::unique_by_key_copy(thrust::cuda::par.on(gpu_stream), owners.begin(), owners.end(), thrust::counting_iterator(0), labelsUnique->begin(), countsUnique->begin()).first - labelsUnique->begin(); // update the info cuda::kd_tree_builder_detail::SetLeftAndRightAndAABB s; @@ -636,7 +766,7 @@ class CudaKdTreeBuilder s.aabbMax=thrust::raw_pointer_cast(&aabbMax[0]); thrust::counting_iterator it(0); - thrust::for_each(it, it+unique_labels, s); + thrust::for_each(thrust::cuda::par.on(gpu_stream), it, it+unique_labels, s); } //! Separates the left and right children of each node into continuous parts of the array. @@ -646,12 +776,12 @@ class CudaKdTreeBuilder //! for all the single nodes. //! (basically the split primitive according to sengupta et al) //! about twice as fast as thrust::partition - void separate_left_and_right_children( thrust::device_vector& key_in, thrust::device_vector& val_in, thrust::device_vector& key_out, thrust::device_vector& val_out, thrust::device_vector& left_right_marks, bool scatter_val_out=true ) + void separate_left_and_right_children( flann::cuda::device_vector_noinit& key_in, flann::cuda::device_vector_noinit& val_in, flann::cuda::device_vector_noinit& key_out, flann::cuda::device_vector_noinit& val_out, flann::cuda::device_vector_noinit& left_right_marks, bool scatter_val_out=true ) { - thrust::device_vector* f_tmp = &val_out; - thrust::device_vector* addr_tmp = tmp_misc_; + flann::cuda::device_vector_noinit* f_tmp = &val_out; + flann::cuda::device_vector_noinit* addr_tmp = tmp_misc_; - thrust::exclusive_scan( /*thrust::make_transform_iterator(*/ left_right_marks.begin() /*,cuda::kd_tree_builder_detail::IsEven*/ + thrust::exclusive_scan(thrust::cuda::par.on(gpu_stream), /*thrust::make_transform_iterator(*/ left_right_marks.begin() /*,cuda::kd_tree_builder_detail::IsEven*/ /*())*/, /*thrust::make_transform_iterator(*/ left_right_marks.end() /*,cuda::kd_tree_builder_detail::IsEven*/ /*())*/, f_tmp->begin() ); cuda::kd_tree_builder_detail::set_addr3 sa; @@ -659,10 +789,10 @@ class CudaKdTreeBuilder sa.f_=thrust::raw_pointer_cast(&(*f_tmp)[0]); sa.npoints_=key_in.size(); thrust::counting_iterator it(0); - thrust::transform(it, it+val_in.size(), addr_tmp->begin(), sa); + thrust::transform(thrust::cuda::par.on(gpu_stream), it, it+val_in.size(), addr_tmp->begin(), sa); - thrust::scatter(key_in.begin(), key_in.end(), addr_tmp->begin(), key_out.begin()); - if( scatter_val_out ) thrust::scatter(val_in.begin(), val_in.end(), addr_tmp->begin(), val_out.begin()); + thrust::scatter(thrust::cuda::par.on(gpu_stream), key_in.begin(), key_in.end(), addr_tmp->begin(), key_out.begin()); + if( scatter_val_out ) thrust::scatter(thrust::cuda::par.on(gpu_stream), val_in.begin(), val_in.end(), addr_tmp->begin(), val_out.begin()); } //! allocates additional space in all the node-related vectors. @@ -670,33 +800,34 @@ class CudaKdTreeBuilder void resize_node_vectors( size_t new_size ) { size_t add = new_size - child1_->size(); - child1_->insert(child1_->end(), add, -1); - parent_->insert(parent_->end(), add, -1); + child1_->append(thrust::cuda::par.on(gpu_stream), add, -1); + parent_->append(thrust::cuda::par.on(gpu_stream), add, -1); cuda::kd_tree_builder_detail::SplitInfo s; s.left=0; s.right=0; - splits_->insert(splits_->end(), add, s); - float4 f; - aabb_min_->insert(aabb_min_->end(), add, f); - aabb_max_->insert(aabb_max_->end(), add, f); + splits_->append(thrust::cuda::par.on(gpu_stream), add, s); + float4 f=make_float4(0.0f, 0.0f, 0.0f, 0.0f); + aabb_min_->append(thrust::cuda::par.on(gpu_stream), add, f); + aabb_max_->append(thrust::cuda::par.on(gpu_stream), add, f); } + cudaStream_t gpu_stream; - const thrust::device_vector* points_; + const flann::cuda::device_vector_noinit* points_; // tree data, those are stored per-node //! left child of each node. (right child==left child + 1, due to the alloc mechanism) //! child1_[node]==-1 if node is a leaf node - thrust::device_vector* child1_; + flann::cuda::device_vector_noinit* child1_; //! parent node of each node - thrust::device_vector* parent_; + flann::cuda::device_vector_noinit* parent_; //! split info (dim/value or left/right pointers) - thrust::device_vector* splits_; + flann::cuda::device_vector_noinit* splits_; //! min aabb value of each node - thrust::device_vector* aabb_min_; + flann::cuda::device_vector_noinit* aabb_min_; //! max aabb value of each node - thrust::device_vector* aabb_max_; + flann::cuda::device_vector_noinit* aabb_max_; enum AllocationInfo { @@ -705,25 +836,25 @@ class CudaKdTreeBuilder OutOfSpace=2 }; // those were put into a single vector of 3 elements so that only one mem transfer will be needed for all three of them - // thrust::device_vector out_of_space_; - // thrust::device_vector node_count_; - // thrust::device_vector nodes_allocated_; - thrust::device_vector allocation_info_; + // flann::cuda::device_vector_noinit out_of_space_; + // flann::cuda::device_vector_noinit node_count_; + // flann::cuda::device_vector_noinit nodes_allocated_; + flann::cuda::device_vector_noinit allocation_info_; int max_leaf_size_; // coordinate values of the points - thrust::device_vector* points_x_, * points_y_, * points_z_; + flann::cuda::device_vector_noinit* points_x_, * points_y_, * points_z_; // indices - thrust::device_vector* index_x_, * index_y_, * index_z_; + flann::cuda::device_vector_noinit* index_x_, * index_y_, * index_z_; // owner node - thrust::device_vector* owners_x_, * owners_y_, * owners_z_; + flann::cuda::device_vector_noinit* owners_x_, * owners_y_, * owners_z_; // contains info about whether a point was partitioned to the left or right child after a split - thrust::device_vector* leftright_x_, * leftright_y_, * leftright_z_; - thrust::device_vector* tmp_index_, * tmp_owners_, * tmp_misc_; + flann::cuda::device_vector_noinit* leftright_x_, * leftright_y_, * leftright_z_; + flann::cuda::device_vector_noinit* tmp_index_, * tmp_owners_, * tmp_misc_; bool delete_node_info_; }; } // namespace flann -#endif \ No newline at end of file +#endif diff --git a/src/cpp/flann/algorithms/kmeans_index.h b/src/cpp/flann/algorithms/kmeans_index.h index c25dabd6..bd2bbe3e 100644 --- a/src/cpp/flann/algorithms/kmeans_index.h +++ b/src/cpp/flann/algorithms/kmeans_index.h @@ -58,7 +58,7 @@ namespace flann struct KMeansIndexParams : public IndexParams { KMeansIndexParams(int branching = 32, int iterations = 11, - flann_centers_init_t centers_init = FLANN_CENTERS_RANDOM, float cb_index = 0.2 ) + flann_centers_init_t centers_init = FLANN_CENTERS_RANDOM, float cb_index = 0.2f ) { (*this)["algorithm"] = FLANN_INDEX_KMEANS; // branching factor @@ -212,7 +212,7 @@ class KMeansIndex : public NNIndex using BaseClass::buildIndex; - void addPoints(const Matrix& points, float rebuild_threshold = 2) + void addPoints(const Matrix& points, float rebuild_threshold = 2.f) { assert(points.cols==veclen_); size_t old_size = size_; diff --git a/src/cpp/flann/algorithms/nn_index.h b/src/cpp/flann/algorithms/nn_index.h index 245d258c..370b42c4 100644 --- a/src/cpp/flann/algorithms/nn_index.h +++ b/src/cpp/flann/algorithms/nn_index.h @@ -111,9 +111,7 @@ class NNIndex : public IndexBase virtual ~NNIndex() { - if (data_ptr_) { - delete[] data_ptr_; - } + delete[] data_ptr_; } @@ -266,9 +264,7 @@ class NNIndex : public IndexBase if (save_dataset) { if (Archive::is_loading::value) { - if (data_ptr_) { - delete[] data_ptr_; - } + delete[] data_ptr_; data_ptr_ = new ElementType[size_*veclen_]; points_.resize(size_); for (size_t i=0;i& queries, + virtual int knnSearch(const Matrix& queries, Matrix& indices, Matrix& dists, size_t knn, @@ -392,7 +388,7 @@ class NNIndex : public IndexBase * @param[in] knn Number of nearest neighbors to return * @param[in] params Search parameters */ - int knnSearch(const Matrix& queries, + virtual int knnSearch(const Matrix& queries, std::vector< std::vector >& indices, std::vector >& dists, size_t knn, @@ -463,7 +459,7 @@ class NNIndex : public IndexBase * @param params * @return */ - int knnSearch(const Matrix& queries, + virtual int knnSearch(const Matrix& queries, std::vector< std::vector >& indices, std::vector >& dists, size_t knn, @@ -488,7 +484,7 @@ class NNIndex : public IndexBase * @param[in] params Search parameters * @return Number of neighbors found */ - int radiusSearch(const Matrix& queries, + virtual int radiusSearch(const Matrix& queries, Matrix& indices, Matrix& dists, float radius, @@ -571,7 +567,7 @@ class NNIndex : public IndexBase * @param params * @return */ - int radiusSearch(const Matrix& queries, + virtual int radiusSearch(const Matrix& queries, Matrix& indices, Matrix& dists, float radius, @@ -598,7 +594,7 @@ class NNIndex : public IndexBase * @param[in] params Search parameters * @return Number of neighbors found */ - int radiusSearch(const Matrix& queries, + virtual int radiusSearch(const Matrix& queries, std::vector< std::vector >& indices, std::vector >& dists, float radius, @@ -677,7 +673,7 @@ class NNIndex : public IndexBase * @param params * @return */ - int radiusSearch(const Matrix& queries, + virtual int radiusSearch(const Matrix& queries, std::vector< std::vector >& indices, std::vector >& dists, float radius, diff --git a/src/cpp/flann/config.h b/src/cpp/flann/config.h index 853cfaa2..36762e9d 100644 --- a/src/cpp/flann/config.h +++ b/src/cpp/flann/config.h @@ -33,7 +33,7 @@ #ifdef FLANN_VERSION_ #undef FLANN_VERSION_ #endif -#define FLANN_VERSION_ "1.9.1" +#define FLANN_VERSION_ "1.9.2" #ifdef FLANN_VERSION_MAJOR_ #undef FLANN_VERSION_MAJOR_ @@ -43,12 +43,12 @@ #ifdef FLANN_VERSION_MINOR_ #undef FLANN_VERSION_MINOR_ #endif -#define FLANN_MINOR_VERSION_ 9 +#define FLANN_VERSION_MINOR_ 9 #ifdef FLANN_VERSION_PATCH_ #undef FLANN_VERSION_PATCH_ #endif -#define FLANN_VERSION_PATCH_ 1 +#define FLANN_VERSION_PATCH_ 2 #endif /* FLANN_CONFIG_H_ */ diff --git a/src/cpp/flann/config.h.in b/src/cpp/flann/config.h.in index 2d087aca..2efd3829 100644 --- a/src/cpp/flann/config.h.in +++ b/src/cpp/flann/config.h.in @@ -43,7 +43,7 @@ #ifdef FLANN_VERSION_MINOR_ #undef FLANN_VERSION_MINOR_ #endif -#define FLANN_MINOR_VERSION_ ${FLANN_VERSION_MINOR} +#define FLANN_VERSION_MINOR_ ${FLANN_VERSION_MINOR} #ifdef FLANN_VERSION_PATCH_ #undef FLANN_VERSION_PATCH_ diff --git a/src/cpp/flann/flann.hpp b/src/cpp/flann/flann.hpp index 3bfe8079..8c582a96 100644 --- a/src/cpp/flann/flann.hpp +++ b/src/cpp/flann/flann.hpp @@ -376,6 +376,7 @@ class Index } IndexHeader header = load_header(fin); if (header.h.data_type != flann_datatype_value::value) { + fclose(fin); throw FLANNException("Datatype of saved index is different than of the one to be loaded."); } diff --git a/src/cpp/flann/util/any.h b/src/cpp/flann/util/any.h index 8014f6a4..be1a594e 100644 --- a/src/cpp/flann/util/any.h +++ b/src/cpp/flann/util/any.h @@ -78,7 +78,10 @@ struct big_any_policy : typed_base_any_policy { virtual void static_delete(void** x) { - if (* x) delete (* reinterpret_cast(x)); *x = NULL; + if (* x) { + delete (* reinterpret_cast(x)); + *x = NULL; + } } virtual void copy_from_value(void const* src, void** dest) { diff --git a/src/cpp/flann/util/dynamic_bitset.h b/src/cpp/flann/util/dynamic_bitset.h index f5812fce..189112c4 100644 --- a/src/cpp/flann/util/dynamic_bitset.h +++ b/src/cpp/flann/util/dynamic_bitset.h @@ -52,13 +52,13 @@ namespace flann { class DynamicBitset { public: - /** @param default constructor + /** default constructor */ DynamicBitset() : size_(0) { } - /** @param only constructor we use in our code + /** only constructor we use in our code * @param the size of the bitset (in bits) */ DynamicBitset(size_t size) @@ -82,7 +82,7 @@ class DynamicBitset return bitset_.empty(); } - /** @param set all the bits to 0 + /** set all the bits to 0 */ void reset() { @@ -90,7 +90,7 @@ class DynamicBitset } /** @brief set one bit to 0 - * @param + * @param index */ void reset(size_t index) { @@ -101,14 +101,14 @@ class DynamicBitset * This function is useful when resetting a given set of bits so that the * whole bitset ends up being 0: if that's the case, we don't care about setting * other bits to 0 - * @param + * @param index */ void reset_block(size_t index) { bitset_[index / cell_bit_size_] = 0; } - /** @param resize the bitset so that it contains at least size bits + /** resize the bitset so that it contains at least size bits * @param size */ void resize(size_t size) @@ -117,7 +117,7 @@ class DynamicBitset bitset_.resize(size / cell_bit_size_ + 1); } - /** @param set a bit to true + /** set a bit to true * @param index the index of the bit to set to 1 */ void set(size_t index) @@ -125,14 +125,14 @@ class DynamicBitset bitset_[index / cell_bit_size_] |= size_t(1) << (index % cell_bit_size_); } - /** @param gives the number of contained bits + /** gives the number of contained bits */ size_t size() const { return size_; } - /** @param check if a bit is set + /** check if a bit is set * @param index the index of the bit to check * @return true if the bit is set */ diff --git a/src/cpp/flann/util/lsh_table.h b/src/cpp/flann/util/lsh_table.h index 58c644ec..6f61ecd6 100644 --- a/src/cpp/flann/util/lsh_table.h +++ b/src/cpp/flann/util/lsh_table.h @@ -235,7 +235,7 @@ class LshTable size_t getKey(const ElementType* /*feature*/) const { std::cerr << "LSH is not implemented for that type" << std::endl; - return -1; + return 0; } /** Get statistics about the table diff --git a/src/cpp/flann/util/result_set.h b/src/cpp/flann/util/result_set.h index 90e5643c..a59ae404 100644 --- a/src/cpp/flann/util/result_set.h +++ b/src/cpp/flann/util/result_set.h @@ -107,7 +107,7 @@ class KNNSimpleResultSet : public ResultSet capacity_(capacity_) { // reserving capacity to prevent memory re-allocations - dist_index_.resize(capacity_, DistIndex(std::numeric_limits::max(),-1)); + dist_index_.resize(capacity_, DistIndex(std::numeric_limits::max(),std::numeric_limits::max())); clear(); } @@ -210,7 +210,7 @@ class KNNResultSet : public ResultSet KNNResultSet(int capacity) : capacity_(capacity) { // reserving capacity to prevent memory re-allocations - dist_index_.resize(capacity_, DistIndex(std::numeric_limits::max(),-1)); + dist_index_.resize(capacity_, DistIndex(std::numeric_limits::max(),std::numeric_limits::max())); clear(); } @@ -252,7 +252,7 @@ class KNNResultSet : public ResultSet #endif { // Check for duplicate indices - for (size_t j = i - 1; dist_index_[j].dist_ == dist && j--;) { + for (size_t j = i; j-- && dist_index_[j].dist_ == dist;) { if (dist_index_[j].index_ == index) { return; } @@ -845,7 +845,7 @@ class RadiusUniqueResultSet : public UniqueResultSet { public: /** Constructor - * @param capacity the number of neighbors to store at max + * @param radius the maximum distance of a neighbor */ RadiusUniqueResultSet(DistanceType radius) : radius_(radius) @@ -904,6 +904,7 @@ class KNNRadiusUniqueResultSet : public KNNUniqueResultSet { public: /** Constructor + * @param radius the maximum distance of a neighbor * @param capacity the number of neighbors to store at max */ KNNRadiusUniqueResultSet(DistanceType radius, size_t capacity) : KNNUniqueResultSet(capacity) diff --git a/src/cpp/flann/util/serialization.h b/src/cpp/flann/util/serialization.h index cdc2e5ea..ac4484b6 100644 --- a/src/cpp/flann/util/serialization.h +++ b/src/cpp/flann/util/serialization.h @@ -723,7 +723,7 @@ class LoadArchive : public InputArchive // If not v1.0 format hack... if (buffer_blocks_ != NULL) { // Read the last '0' in the file - size_t zero = -1; + size_t zero = 1; if (fread(&zero, sizeof(zero), 1, stream_) != 1) { throw FLANNException("Invalid index file, cannot read from disk (end)"); } diff --git a/src/python/pyflann/index.py b/src/python/pyflann/index.py index fb6beae9..d6e59ac3 100644 --- a/src/python/pyflann/index.py +++ b/src/python/pyflann/index.py @@ -110,7 +110,7 @@ def nn(self, pts, qpts, num_neighbors=1, **kwargs): raise FLANNException('Cannot handle type: %s' % pts.dtype) if qpts.dtype.type not in allowed_types: - raise FLANNException('Cannot handle type: %s' % pts.dtype) + raise FLANNException('Cannot handle type: %s' % qpts.dtype) if pts.dtype != qpts.dtype: raise FLANNException('Data and query must have the same type') diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index f21b58ff..ca848c7f 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -10,15 +10,6 @@ add_dependencies(test tests) set(EXECUTABLE_OUTPUT_PATH ${TEST_OUTPUT_PATH}) -if (PYTHON_EXECUTABLE) - flann_download_test_data(sift10K.h5 0964a910946d2dd5fe28337507a8abc3) - flann_download_test_data(sift10K_byte.h5 f835e0148df4618a81f67febfda2b4d0) - flann_download_test_data(sift100K.h5 ae2b08f93f3d9f89f5d68566b0406102) - flann_download_test_data(sift100K_byte.h5 b772255fd2044e9d2a5a0183953e4705) - flann_download_test_data(cloud.h5 dfc77bad391a3ae739a9874f4a5dc0d7) - flann_download_test_data(brief100K.h5 e1e781c0955917bc2f0a27b6344c2342) -endif() - if (GTEST_FOUND AND HDF5_FOUND) include_directories(${HDF5_INCLUDE_DIR}) @@ -56,7 +47,7 @@ if (GTEST_FOUND AND HDF5_FOUND) endif() if (GTEST_FOUND AND HDF5_FOUND AND BUILD_CUDA_LIB) - set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-Xcompiler;-fPIC;-arch=sm_13" ) + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-Xcompiler;-fPIC;-Xcudafe \"--diag_suppress=partial_override\" ;-gencode=arch=compute_52,code=\"sm_52,compute_52\";-gencode=arch=compute_61,code=\"sm_61,compute_61\"" ) if (NVCC_COMPILER_BINDIR) set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};--compiler-bindir=${NVCC_COMPILER_BINDIR}") endif() @@ -75,9 +66,9 @@ if (BUILD_PYTHON_BINDINGS) endif() #---------- ruby spec ---------------- -if (BUILD_C_BINDINGS) - add_custom_target(flann_ruby_spec - COMMAND bundle exec rake spec - WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/src/ruby) - add_dependencies(test flann_ruby_spec) -endif() +#if (BUILD_C_BINDINGS) +# add_custom_target(flann_ruby_spec +# COMMAND bundle exec rake spec +# WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/src/ruby) +# add_dependencies(test flann_ruby_spec) +#endif() diff --git a/test/flann_autotuned_test.cpp b/test/flann_autotuned_test.cpp index 28f4fc79..b48f0035 100644 --- a/test/flann_autotuned_test.cpp +++ b/test/flann_autotuned_test.cpp @@ -27,9 +27,9 @@ class Autotuned_SIFT100K : public FLANNTestFixture { indices = flann::Matrix(new size_t[1000*5], 1000, 5); printf("Reading test data..."); fflush(stdout); - flann::load_from_file(data, "sift100K.h5","dataset"); - flann::load_from_file(query,"sift100K.h5","query"); - flann::load_from_file(match,"sift100K.h5","match"); + flann::load_from_file(data, "../datasets/sift100K.h5","dataset"); + flann::load_from_file(query,"../datasets/sift100K.h5","query"); + flann::load_from_file(match,"../datasets/sift100K.h5","match"); printf("done\n"); } diff --git a/test/flann_cuda_test.cu b/test/flann_cuda_test.cu index 30450002..d0aca0ba 100644 --- a/test/flann_cuda_test.cu +++ b/test/flann_cuda_test.cu @@ -112,9 +112,9 @@ protected: { printf("Reading test data..."); fflush(stdout); - flann::load_from_file(data, "cloud.h5","dataset"); - flann::load_from_file(query,"cloud.h5","query"); - flann::load_from_file(match,"cloud.h5","indices"); + flann::load_from_file(data, "../datasets/cloud.h5","dataset"); + flann::load_from_file(query,"../datasets/cloud.h5","query"); + flann::load_from_file(match,"../datasets/cloud.h5","match"); dists = flann::Matrix(new float[query.rows*5], query.rows, 5); indices = flann::Matrix(new int[query.rows*5], query.rows, 5); diff --git a/test/flann_hierarchical_test.cpp b/test/flann_hierarchical_test.cpp index f3b8adbf..f4f6624b 100644 --- a/test/flann_hierarchical_test.cpp +++ b/test/flann_hierarchical_test.cpp @@ -28,8 +28,8 @@ class HierarchicalIndex_Brief100K : public FLANNTestFixture k_nn_ = 3; printf("Reading test data..."); fflush(stdout); - flann::load_from_file(data, "brief100K.h5", "dataset"); - flann::load_from_file(query, "brief100K.h5", "query"); + flann::load_from_file(data, "../datasets/brief100K.h5", "dataset"); + flann::load_from_file(query, "../datasets/brief100K.h5", "query"); printf("done\n"); flann::Index index(data, flann::LinearIndexParams()); diff --git a/test/flann_kdtree_single_test.cpp b/test/flann_kdtree_single_test.cpp index feba4d29..aaf8513f 100644 --- a/test/flann_kdtree_single_test.cpp +++ b/test/flann_kdtree_single_test.cpp @@ -10,7 +10,7 @@ using namespace flann; class KDTreeSingle :public DatasetTestFixture { protected: - KDTreeSingle() : DatasetTestFixture("cloud.h5") {} + KDTreeSingle() : DatasetTestFixture("../datasets/cloud.h5") {} }; TEST_F(KDTreeSingle, TestSearch) @@ -29,7 +29,7 @@ TEST_F(KDTreeSingle, TestSearch2) TEST_F(KDTreeSingle, TestSearchPadded) { flann::Matrix data_padded; - flann::load_from_file(data_padded, "cloud.h5", "dataset_padded"); + flann::load_from_file(data_padded, "../datasets/cloud.h5", "dataset_padded"); flann::Matrix data2(data_padded.ptr(), data_padded.rows, 3, data_padded.cols*sizeof(float)); TestSearch >(data2, flann::KDTreeSingleIndexParams(12, false), diff --git a/test/flann_kdtree_test.cpp b/test/flann_kdtree_test.cpp index 17bf91db..971aa3a6 100644 --- a/test/flann_kdtree_test.cpp +++ b/test/flann_kdtree_test.cpp @@ -13,7 +13,7 @@ using namespace flann; */ class KDTree_SIFT10K : public DatasetTestFixture { protected: - KDTree_SIFT10K() : DatasetTestFixture("sift10K.h5") {} + KDTree_SIFT10K() : DatasetTestFixture("../datasets/sift10K.h5") {} }; TEST_F(KDTree_SIFT10K, TestSearch) @@ -72,7 +72,7 @@ TEST_F(KDTree_SIFT10K, TestCopy2) */ class KDTree_SIFT100K : public DatasetTestFixture { protected: - KDTree_SIFT100K() : DatasetTestFixture("sift100K.h5") {} + KDTree_SIFT100K() : DatasetTestFixture("../datasets/sift100K.h5") {} }; @@ -110,7 +110,7 @@ TEST_F(KDTree_SIFT100K, TestRemove) */ class KDTree_SIFT10K_byte : public DatasetTestFixture { protected: - KDTree_SIFT10K_byte() : DatasetTestFixture("sift10K_byte.h5") {} + KDTree_SIFT10K_byte() : DatasetTestFixture("../datasets/sift10K_byte.h5") {} }; @@ -123,7 +123,7 @@ TEST_F(KDTree_SIFT10K_byte, TestSearch) class KDTree_SIFT100K_byte : public DatasetTestFixture { protected: - KDTree_SIFT100K_byte() : DatasetTestFixture("sift100K_byte.h5") {} + KDTree_SIFT100K_byte() : DatasetTestFixture("../datasets/sift100K_byte.h5") {} }; diff --git a/test/flann_kmeans_test.cpp b/test/flann_kmeans_test.cpp index 47b01de3..86ceae0b 100644 --- a/test/flann_kmeans_test.cpp +++ b/test/flann_kmeans_test.cpp @@ -13,7 +13,7 @@ using namespace flann; */ class KMeans_SIFT10K : public DatasetTestFixture { protected: - KMeans_SIFT10K() : DatasetTestFixture("sift10K.h5") {} + KMeans_SIFT10K() : DatasetTestFixture("../datasets/sift10K.h5") {} }; @@ -77,7 +77,7 @@ TEST_F(KMeans_SIFT10K, TestCopy2) */ class KMeans_SIFT100K : public DatasetTestFixture { protected: - KMeans_SIFT100K() : DatasetTestFixture("sift100K.h5") {} + KMeans_SIFT100K() : DatasetTestFixture("../datasets/sift100K.h5") {} }; @@ -120,7 +120,7 @@ TEST_F(KMeans_SIFT100K, TestSave) */ class KMeans_SIFT10K_byte : public DatasetTestFixture { protected: - KMeans_SIFT10K_byte() : DatasetTestFixture("sift10K_byte.h5") {} + KMeans_SIFT10K_byte() : DatasetTestFixture("../datasets/sift10K_byte.h5") {} }; TEST_F(KMeans_SIFT10K_byte, TestSearch) @@ -133,7 +133,7 @@ TEST_F(KMeans_SIFT10K_byte, TestSearch) class KMeans_SIFT100K_byte : public DatasetTestFixture { protected: - KMeans_SIFT100K_byte() : DatasetTestFixture("sift100K_byte.h5") {} + KMeans_SIFT100K_byte() : DatasetTestFixture("../datasets/sift100K_byte.h5") {} }; TEST_F(KMeans_SIFT100K_byte, TestSearch) diff --git a/test/flann_linear_test.cpp b/test/flann_linear_test.cpp index 4f6ea2a6..3da9511d 100644 --- a/test/flann_linear_test.cpp +++ b/test/flann_linear_test.cpp @@ -13,7 +13,7 @@ using namespace flann; */ class Linear_SIFT10K : public DatasetTestFixture { protected: - Linear_SIFT10K() : DatasetTestFixture("sift10K.h5") {} + Linear_SIFT10K() : DatasetTestFixture("../datasets/sift10K.h5") {} }; @@ -61,7 +61,7 @@ TEST_F(Linear_SIFT10K, TestCopy2) */ class Linear_SIFT100K : public DatasetTestFixture { protected: - Linear_SIFT100K() : DatasetTestFixture("sift100K.h5") {} + Linear_SIFT100K() : DatasetTestFixture("../datasets/sift100K.h5") {} }; @@ -77,7 +77,7 @@ TEST_F(Linear_SIFT100K, TestSearch) */ class Linear_SIFT10K_byte : public DatasetTestFixture { protected: - Linear_SIFT10K_byte() : DatasetTestFixture("sift10K_byte.h5") {} + Linear_SIFT10K_byte() : DatasetTestFixture("../datasets/sift10K_byte.h5") {} }; @@ -93,7 +93,7 @@ TEST_F(Linear_SIFT10K_byte, Linear) class Linear_SIFT100K_byte : public DatasetTestFixture { protected: - Linear_SIFT100K_byte() : DatasetTestFixture("sift100K_byte.h5") {} + Linear_SIFT100K_byte() : DatasetTestFixture("../datasets/sift100K_byte.h5") {} }; diff --git a/test/flann_lsh_test.cpp b/test/flann_lsh_test.cpp index 11a797e5..2f5a4887 100644 --- a/test/flann_lsh_test.cpp +++ b/test/flann_lsh_test.cpp @@ -28,8 +28,8 @@ class LshIndex_Brief100K : public FLANNTestFixture k_nn_ = 3; printf("Reading test data..."); fflush(stdout); - flann::load_from_file(data, "brief100K.h5", "dataset"); - flann::load_from_file(query, "brief100K.h5", "query"); + flann::load_from_file(data, "../datasets/brief100K.h5", "dataset"); + flann::load_from_file(query, "../datasets/brief100K.h5", "query"); dists = flann::Matrix(new DistanceType[query.rows * k_nn_], query.rows, k_nn_); indices = flann::Matrix(new size_t[query.rows * k_nn_], query.rows, k_nn_); @@ -37,7 +37,7 @@ class LshIndex_Brief100K : public FLANNTestFixture printf("done\n"); // The matches are bogus so we compute them the hard way - // flann::load_from_file(match,"brief100K.h5","indices"); + // flann::load_from_file(match,"../datasets/brief100K.h5","indices"); flann::Index index(data, flann::LinearIndexParams()); index.buildIndex(); diff --git a/test/flann_multithreaded_test.cpp b/test/flann_multithreaded_test.cpp index 4d412e4f..1cd76145 100644 --- a/test/flann_multithreaded_test.cpp +++ b/test/flann_multithreaded_test.cpp @@ -68,9 +68,9 @@ class FlannTest : public FLANNTestFixture { printf("Reading test data..."); fflush(stdout); - flann::load_from_file(data_, "cloud.h5","dataset"); - flann::load_from_file(query_,"cloud.h5","query"); - flann::load_from_file(match_,"cloud.h5","match"); + flann::load_from_file(data_, "../datasets/cloud.h5","dataset"); + flann::load_from_file(query_,"../datasets/cloud.h5","query"); + flann::load_from_file(match_,"../datasets/cloud.h5","match"); dists_ = flann::Matrix(new float[query_.rows*knn_], query_.rows, knn_); indices_ = flann::Matrix(new size_t[query_.rows*knn_], query_.rows, knn_); @@ -155,8 +155,8 @@ class FlannCompareKnnTest : public FLANNTestFixture { printf("Reading test data..."); fflush(stdout); - flann::load_from_file(data_, "cloud.h5","dataset"); - flann::load_from_file(query_,"cloud.h5","query"); + flann::load_from_file(data_, "../datasets/cloud.h5","dataset"); + flann::load_from_file(query_,"../datasets/cloud.h5","query"); dists_single_ = flann::Matrix(new float[query_.rows*knn_], query_.rows, knn_); indices_single_ = flann::Matrix(new size_t[query_.rows*knn_], query_.rows, knn_); @@ -229,8 +229,8 @@ class FlannCompareRadiusTest : public FLANNTestFixture { printf("Reading test data..."); fflush(stdout); - flann::load_from_file(data_, "cloud.h5","dataset"); - flann::load_from_file(query_,"cloud.h5","query"); + flann::load_from_file(data_, "../datasets/cloud.h5","dataset"); + flann::load_from_file(query_,"../datasets/cloud.h5","query"); int reserve_size = data_.rows / 1000;