Skip to content

Commit 11739ed

Browse files
vpetrovykhaljazerzen
authored andcommitted
Add codecs for dealing with pgsparse vector. (#478)
Add codecs for converting between regular arrays and sparse vectors.
1 parent 113ed0d commit 11739ed

File tree

2 files changed

+230
-0
lines changed

2 files changed

+230
-0
lines changed

edgedb/protocol/codecs/codecs.pyx

+114
Original file line numberDiff line numberDiff line change
@@ -798,6 +798,113 @@ cdef pgvector_decode(pgproto.CodecContext settings, FRBuffer *buf):
798798
return val
799799

800800

801+
# The pg_sparse extension uses a signed int16 when reading the dimension in
# binary format.
DEF PGSPARSE_MAX_DIM = (1 << 15) - 1


cdef pgsparse_encode(pgproto.CodecContext settings, WriteBuffer buf,
                     object obj):
    """Encode *obj* as a pg_sparse binary sparse vector.

    Wire layout written here: int32 payload length, int16 count of
    non-zero elements, int16 dimension, int16 zero, then one
    (int32 index, float32 value) pair per non-zero element.

    Raises ValueError if the vector has more than PGSPARSE_MAX_DIM
    elements, and TypeError if *obj* is not a sized iterable.
    """
    cdef:
        int16_t n_elem = 0
        int64_t dim
        Py_ssize_t i
        float[:] memview

    # If we can take a typed memoryview of the object, we use that.
    # That is good, because it means we can consume array.array and
    # numpy.ndarray without needing to unbox.
    # Otherwise we take the slow path, indexing into the array using
    # the normal protocol.
    try:
        memview = obj
    except (ValueError, TypeError):
        # Not buffer-compatible; fall through to the generic path below.
        pass
    else:
        # The actual dimensionality of the vector is the size of the raw
        # array.
        dim = len(memview)
        if dim > PGSPARSE_MAX_DIM:
            raise ValueError('too many elements in vector value')

        # First pass to count the number of non-zero elements
        for i in range(dim):
            if memview[i] != 0:
                n_elem += 1

        # 6 header bytes (n_elem, dim, reserved) + 8 bytes per element.
        buf.write_int32(6 + n_elem*8)
        buf.write_int16(n_elem)
        buf.write_int16(<int16_t>dim)
        buf.write_int16(0)
        # Second pass will write the actual non-zero elements
        for i in range(dim):
            if memview[i] != 0:
                buf.write_int32(i)
                buf.write_float(memview[i])
        return

    # Annoyingly, this is literally identical code to the fast path...
    # but the types are different in critical ways.
    if not _is_array_iterable(obj):
        raise TypeError(
            'a sized iterable container expected (got type {!r})'.format(
                type(obj).__name__))

    # The actual dimensionality of the vector is the size of the raw array.
    dim = len(obj)
    if dim > PGSPARSE_MAX_DIM:
        raise ValueError('too many elements in vector value')

    # First pass to count the number of non-zero elements
    for i in range(dim):
        if obj[i] != 0:
            n_elem += 1

    # 6 header bytes (n_elem, dim, reserved) + 8 bytes per element.
    buf.write_int32(6 + n_elem*8)
    buf.write_int16(n_elem)
    # Cast explicitly, matching the fast path above; dim was bounds-checked
    # against PGSPARSE_MAX_DIM so the narrowing is safe.
    buf.write_int16(<int16_t>dim)
    buf.write_int16(0)
    # Second pass will write the actual non-zero elements
    for i in range(dim):
        if obj[i] != 0:
            buf.write_int32(i)
            buf.write_float(obj[i])
872+
873+
cdef pgsparse_decode(pgproto.CodecContext settings, FRBuffer *buf):
    # Decode a pg_sparse binary sparse vector into a dense float array.
    # Wire layout read here: int16 count of non-zero elements, int16
    # dimension, 2 skipped bytes, then one (int32 index, float32 value)
    # pair per non-zero element.
    cdef:
        int16_t n_elem
        int16_t dim
        Py_ssize_t i
        int32_t index
        float[::1] array_view

    n_elem = hton.unpack_int16(frb_read(buf, 2))
    dim = hton.unpack_int16(frb_read(buf, 2))
    # Skip the third int16 of the header (written as 0 by the encoder).
    frb_read(buf, 2)

    # Create a float array with size dim
    val = ONE_EL_ARRAY * dim
    array_view = val

    # The underlying sparse Vector representation supports int32 as the
    # dimension and index, but when converting to binary format the dimensions
    # are maxed out at int16. So indexes beyond the truncated dimension will
    # cause an exception.
    if dim < 0:
        # This is actually an indicator of overflow when converting from int32
        # down to int16.
        raise ValueError('too many elements in vector value')
    try:
        # Fill the non-zero elements
        for i in range(n_elem):
            index = hton.unpack_int32(frb_read(buf, 4))
            array_view[index] = hton.unpack_float(frb_read(buf, 4))
    except IndexError:
        # An element index past the (possibly truncated) dimension.
        raise ValueError('too many elements in vector value')

    return val
906+
907+
801908
cdef checked_decimal_encode(
802909
pgproto.CodecContext settings, WriteBuffer buf, obj
803910
):
@@ -1007,5 +1114,12 @@ cdef register_base_scalar_codecs():
10071114
uuid.UUID('9565dd88-04f5-11ee-a691-0b6ebe179825'),
10081115
)
10091116

1117+
    # Codec for ext::pgsparse::vector; the client-side representation is a
    # dense float array (see pgsparse_encode/pgsparse_decode).
    register_base_scalar_codec(
        'ext::pgsparse::vector',
        pgsparse_encode,
        pgsparse_decode,
        uuid.UUID('b646ace0-266d-47ce-8263-1224c38a4a12'),
    )
1123+
10101124

10111125
register_base_scalar_codecs()

tests/test_vector.py

+116
Original file line numberDiff line numberDiff line change
@@ -129,3 +129,119 @@ async def test_vector_01(self):
129129
''',
130130
'foo',
131131
)
132+
133+
134+
class TestSparseVector(tb.SyncQueryTestCase):
    """Round-trip tests for the ext::pgsparse::vector codec."""

    def setUp(self):
        super().setUp()

        # Skip unless the server has the pgsparse extension package
        # available.
        if not self.client.query_required_single('''
            select exists (
                select sys::ExtensionPackage filter .name = 'pgsparse'
            )
        '''):
            self.skipTest("feature not implemented")

        self.client.execute('''
            create extension pgsparse;
        ''')

    def tearDown(self):
        try:
            self.client.execute('''
                drop extension pgsparse;
            ''')
        finally:
            super().tearDown()

    async def test_vector_01(self):
        # Sparse vectors decode to a dense float array on the client.
        val = self.client.query_single('''
            select <ext::pgsparse::vector>[1.5,0,0,0,2.0,3.8]
        ''')
        self.assertIsInstance(val, array.array)
        self.assertEqual(val, array.array('f', [1.5, 0, 0, 0, 2.0, 3.8]))

        # A plain list is accepted as a vector argument.
        val = self.client.query_single(
            '''
                select <json><ext::pgsparse::vector>$0
            ''',
            [3.0, 9.0, -42.5],
        )
        self.assertEqual(val, '[3, 9, -42.5]')

        # A float array is accepted via the typed-memoryview fast path.
        val = self.client.query_single(
            '''
                select <json><ext::pgsparse::vector>$0
            ''',
            array.array('f', [3.0, 9.0, -42.5])
        )
        self.assertEqual(val, '[3, 9, -42.5]')

        # An int array must also work (generic slow path).
        val = self.client.query_single(
            '''
                select <json><ext::pgsparse::vector>$0
            ''',
            array.array('i', [1, 2, 3]),
        )
        self.assertEqual(val, '[1, 2, 3]')

        # Mostly-zero vectors within the int16 dimension limit round-trip.
        val = self.client.query_single(
            '''
                select <ext::pgsparse::vector>$0
            ''',
            array.array('f', ([0] * 10000) + [1, 2]),
        )
        self.assertEqual(val, array.array('f', ([0] * 10000) + [1, 2]))

        val = self.client.query_single(
            '''
                with zeros := array_agg(
                    (for x in range_unpack(range(0, 20000)) union 0)
                )
                select <ext::pgsparse::vector>(zeros ++ [1, 2]);
            ''',
        )
        self.assertEqual(val, array.array('f', ([0] * 20000) + [1, 2]))

        # Some sad path tests
        with self.assertRaises(edgedb.InvalidArgumentError):
            self.client.query_single(
                '''
                    select <ext::pgsparse::vector>$0
                ''',
                [3.0, None, -42.5],
            )

        with self.assertRaises(edgedb.InvalidArgumentError):
            self.client.query_single(
                '''
                    select <ext::pgsparse::vector>$0
                ''',
                [3.0, 'x', -42.5],
            )

        with self.assertRaises(edgedb.InvalidArgumentError):
            self.client.query_single(
                '''
                    select <ext::pgsparse::vector>$0
                ''',
                'foo',
            )

        # Client-side rejection: dimension exceeds the int16 wire limit.
        with self.assertRaises(edgedb.InvalidArgumentError):
            self.client.query_single(
                '''
                    select <ext::pgsparse::vector>$0
                ''',
                array.array('f', ([0] * 50000) + [1, 2]),
            )

        # Server-side vector too large to decode over the binary protocol.
        with self.assertRaises(edgedb.ClientError):
            self.client.query_single(
                '''
                    with zeros := array_agg(
                        (for x in range_unpack(range(0, 50000)) union 0)
                    )
                    select <ext::pgsparse::vector>(zeros ++ [1, 2]);
                ''',
            )

0 commit comments

Comments
 (0)