Fix issue with empty index and non-zero roots size; alignment fixes

ViperCraft · May 30, 2023 · fde86b2 · fde86b2
1 parent 0ba04a7
commit fde86b2
Show file tree

Hide file tree

Showing 3 changed files with 32 additions and 17 deletions.
diff --git a/examples/packed_index_test.cpp b/examples/packed_index_test.cpp
@@ -282,15 +282,15 @@ int main(int argc, char **argv) {
     {
         // DotProduct
         test<DotProductPacked16>(256, 256, 100000);
-        test<DotProductPacked16>(64, 128, 1000000);
+        test<DotProductPacked16>(64, 64, 1000000);
         // and hard case for avx, causes a split
-        test<DotProductPacked16>(40, 64, 100000);
+        test<DotProductPacked16>(40, 40, 100000);
         // Euclidean
         test<EuclideanPacked16>(256, 256, 100000);
-        test<EuclideanPacked16>(64, 128, 1000000);
+        test<EuclideanPacked16>(64, 64, 1000000);
         // and hard case for avx, causes a split
-        test<EuclideanPacked16>(40, 64, 100000);
-        CHECK_AND_THROW( in_mem_test(64, 128, 100000) > 0.9 );
+        test<EuclideanPacked16>(40, 40, 100000);
+        CHECK_AND_THROW( in_mem_test(64, 64, 100000) > 0.9 );
         // the case where we try to build a very small index
         CHECK_AND_THROW( in_mem_test(64, 64, 17) >= 0.25 );
         // edge cases

diff --git a/src/packedlib.h b/src/packedlib.h
@@ -26,7 +26,7 @@
 #include <assert.h>
 
 #ifdef __GNUC__
-#  define alloca_aligned(sz) static_cast<char*>(__builtin_alloca_with_align(sz, 16))
+#  define alloca_aligned(sz) static_cast<char*>(__builtin_alloca_with_align(sz, 64))
 #else
 /* Clang is expected to generate an already-aligned stack allocation */
 #  define alloca_aligned(sz) static_cast<char*>(alloca(sz))
@@ -91,7 +91,10 @@ namespace detail {
     MMapWriter& operator == ( MMapWriter const & ) = delete;
     bool open( char const */*filename*/, size_t calculated_size )
     {
-      void *p = mmap(0, calculated_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
+
+      void *p = calculated_size ? mmap(0, calculated_size, PROT_READ | PROT_WRITE,
+                                      MAP_PRIVATE | MAP_ANONYMOUS, 0, 0)
+                                : nullptr;
       if( p == MAP_FAILED )
         return false;
 
@@ -101,7 +104,8 @@ namespace detail {
 
 #if defined(MADV_DONTDUMP)
       // Exclude from a core dump those pages
-      madvise(p, calculated_size, MADV_DONTDUMP);
+      if (p != nullptr)
+        madvise(p, calculated_size, MADV_DONTDUMP);
 #endif
       return true;
     }
@@ -227,18 +231,27 @@ class PackedAnnoyIndexer {
 
     _n_nodes = _n_items;
     while (1) {
-      if (q == -1 && _n_nodes >= _n_items * 2)
-        break;
-      if (q != -1 && _roots.size() >= (size_t)q)
+      if (q == -1)
+      {
+        if (_n_nodes >= _n_items * 2)
+          break;
+      }
+      else if (_roots.size() >= (size_t)q)
         break;
+
       if (_verbose) annoylib_showUpdate("pass %zd...\n", _roots.size());
 
+
       vector<S> indices;
       for (S i = 0; i < _n_items; i++) {
           if (_get(i)->n_descendants >= 1) // Issue #223
           indices.push_back(i);
       }
 
+      // cannot make roots w/o items
+      if( indices.empty() )
+        break;
+
       _roots.push_back(_make_tree(indices, true));
     }
 
@@ -436,7 +449,7 @@ class PackedAnnoyIndexer {
     S const max_n_descendants = _K - 1;
 
     if (isz <= max_n_descendants && (!is_root || (size_t)_n_items <= (size_t)max_n_descendants || isz == 1)) {
-      if( !is_root )
+      if (!is_root)
         // only non-roots can have indices only nodes!
         return _append_indices(indices);
 
@@ -598,6 +611,8 @@ class PackedAnnoySearcher {
 
     size_t n_nodes = (S)((_mapping.size - sizeof_indices) / _s);
 
+
+
     // Find the roots by scanning the end of the file and taking the nodes with most descendants
     std::vector<S> roots;
     roots.clear();
@@ -623,7 +638,7 @@ class PackedAnnoySearcher {
       _roots_q.emplace_back(Distance::template pq_initial_value<T>(), r);
     }
     std::make_heap(_roots_q.begin(), _roots_q.end());
-    _n_items = m;
+    _n_items = m != -1 ? m : 0;
 
     return true;
   }

diff --git a/src/packutils.h b/src/packutils.h
@@ -105,11 +105,11 @@ float decode_and_dot_i16_f32( uint16_t const *__restrict__ in, float const *__re
       __m256i s  = _mm256_lddqu_si256( (__m256i const*)(in) );
       __m256i ai = _mm256_srai_epi32(_mm256_unpacklo_epi16(s, s), 16);
       __m256 a = _mm256_mul_ps(_mm256_cvtepi32_ps(ai), mm1);
-      mx = _mm256_load_ps(y);
+      mx = _mm256_loadu_ps(y);
       __m256i bi = _mm256_srai_epi32(_mm256_unpackhi_epi16(s, s), 16);
       msum1 = _mm256_add_ps (msum1, _mm256_mul_ps (a, mx));
       __m256 b = _mm256_mul_ps(_mm256_cvtepi32_ps(bi), mm1);
-      my = _mm256_load_ps(y + 8);
+      my = _mm256_loadu_ps(y + 8);
       msum2 = _mm256_add_ps (msum2, _mm256_mul_ps (b, my));
       in += 16;
       y += 16;
@@ -159,12 +159,12 @@ inline float decode_and_euclidean_distance_i16_f32( uint16_t const *__restrict__
       __m256i s  = _mm256_lddqu_si256( (__m256i const*)(in) );
       __m256i ai = _mm256_srai_epi32(_mm256_unpacklo_epi16(s, s), 16);
       __m256 a = _mm256_mul_ps(_mm256_cvtepi32_ps(ai), mm1);
-      mx = _mm256_load_ps(y);
+      mx = _mm256_loadu_ps(y);
       __m256i bi = _mm256_srai_epi32(_mm256_unpackhi_epi16(s, s), 16);
       __m256 d1 = _mm256_sub_ps (a, mx);
       msum1 = _mm256_add_ps (msum1, _mm256_mul_ps (d1, d1));
       __m256 b = _mm256_mul_ps(_mm256_cvtepi32_ps(bi), mm1);
-      my = _mm256_load_ps(y + 8);
+      my = _mm256_loadu_ps(y + 8);
       __m256 d2 = _mm256_sub_ps (b, my);
       msum2 = _mm256_add_ps (msum2, _mm256_mul_ps (d2, d2));
       in += 16;