From 6d7d792e536ff1d15cf5344787c11227f0010572 Mon Sep 17 00:00:00 2001 From: thodson Date: Fri, 2 Feb 2024 11:23:07 -0600 Subject: [PATCH 01/15] Draft bitinfo codec --- numcodecs/bitinfo.py | 263 ++++++++++++++++++++++++++++++++++++++++++ numcodecs/bitround.py | 14 ++- 2 files changed, 273 insertions(+), 4 deletions(-) create mode 100644 numcodecs/bitinfo.py diff --git a/numcodecs/bitinfo.py b/numcodecs/bitinfo.py new file mode 100644 index 00000000..51049c84 --- /dev/null +++ b/numcodecs/bitinfo.py @@ -0,0 +1,263 @@ +import numpy as np + +from .compat import ensure_ndarray_like +from .bitround import BitRound + +# The size in bits of the mantissa/significand for the various floating types +# You cannot keep more bits of data than you have available +# https://en.wikipedia.org/wiki/IEEE_754 + +NMBITS = {64: 12, 32: 9, 16: 6} # number of non mantissa bits for given dtype + +class BitInfo(BitRound): + """Floating-point bit information codec + + Drops bits from the floating point mantissa, leaving an array more amenable + to compression. The number of bits to keep is determined using the approach + from Klöwer et al. 2021 (https://www.nature.com/articles/s43588-021-00156-2). + See https://github.com/zarr-developers/numcodecs/issues/298 for discussion + and the original implementation in Julia referred to at + https://github.com/milankl/BitInformation.jl + + Parameters + ---------- + + inflevel: float + The number of bits of the mantissa to keep. The range allowed + depends on the dtype input data. If keepbits is + equal to the maximum allowed for the data type, this is equivalent + to no transform. + + axes: int or list of int, optional + Axes along which to calculate the bit information. If None, all axes + are used. + """ + + codec_id = 'bitinfo' + + def __init__(self, inflevel: float, axes=None): + if (inflevel < 0) or (inflevel > 1.0): + raise ValueError("Please provide `inflevel` from interval [0.,1.]") + + self.inflevel = inflevel + self.axes = axes + + def encode(self, buf): + """Create int array by rounding floating-point data + + The itemsize will be preserved, but the output should be much more + compressible. + """ + a = ensure_ndarray_like(buf) + if not a.dtype.kind == "f" or a.dtype.itemsize > 8: + raise TypeError("Only float arrays (16-64bit) can be bit-rounded") + + if self.axes is None: + axes = range(a.ndim) + + itemsize = a.dtype.itemsize + astype = f"u{itemsize}" + if a.dtype in (np.float16, np.float32, np.float64): + a = signed_exponent(a) + + a = a.astype(astype) + keepbits = [] + + for ax in axes: + info_per_bit = bitinformation(a, axis=ax) + keepbits.append(get_keepbits(info_per_bit, self.inflevel)) + + keepbits = max(keepbits) + + return BitRound._bitround(a, keepbits) + + +def exponent_bias(dtype): + """ + Returns the exponent bias for a given floating-point dtype. + + Example + ------- + >>> exponent_bias("f4") + 127 + >>> exponent_bias("f8") + 1023 + """ + info = np.finfo(dtype) + exponent_bits = info.bits - info.nmant - 1 + return 2 ** (exponent_bits - 1) - 1 + + +def exponent_mask(dtype): + """ + Returns exponent mask for a given floating-point dtype. + + Example + ------- + >>> np.binary_repr(exponent_mask(np.float32), width=32) + '01111111100000000000000000000000' + >>> np.binary_repr(exponent_mask(np.float16), width=16) + '0111110000000000' + """ + if dtype == np.float16: + mask = 0x7C00 + elif dtype == np.float32: + mask = 0x7F80_0000 + elif dtype == np.float64: + mask = 0x7FF0_0000_0000_0000 + return mask + + +def signed_exponent(A): + """ + Transform biased exponent notation to signed exponent notation. + + Parameters + ---------- + A : :py:class:`numpy.array` + Array to transform + + Returns + ------- + B : :py:class:`numpy.array` + + Example + ------- + >>> A = np.array(0.03125, dtype="float32") + >>> np.binary_repr(A.view("uint32"), width=32) + '00111101000000000000000000000000' + >>> np.binary_repr(signed_exponent(A), width=32) + '01000010100000000000000000000000' + >>> A = np.array(0.03125, dtype="float64") + >>> np.binary_repr(A.view("uint64"), width=64) + '0011111110100000000000000000000000000000000000000000000000000000' + >>> np.binary_repr(signed_exponent(A), width=64) + '0100000001010000000000000000000000000000000000000000000000000000' + """ + itemsize = A.dtype.itemsize + uinttype = f"u{itemsize}" + inttype = f"i{itemsize}" + + sign_mask = 1 << np.finfo(A.dtype).bits - 1 + sfmask = sign_mask | (1 << np.finfo(A.dtype).nmant) - 1 + emask = exponent_mask(A.dtype) + esignmask = sign_mask >> 1 + + sbits = np.finfo(A.dtype).nmant + if itemsize == 8: + sbits = np.uint64(sbits) + emask = np.uint64(emask) + bias = exponent_bias(A.dtype) + + ui = A.view(uinttype) + sf = ui & sfmask + e = ((ui & emask) >> sbits).astype(inttype) - bias + max_eabs = np.iinfo(A.view(uinttype).dtype).max >> sbits + eabs = abs(e) % (max_eabs + 1) + esign = np.where(e < 0, esignmask, 0) + if itemsize == 8: + eabs = np.uint64(eabs) + esign = np.uint64(esign) + esigned = esign | (eabs << sbits) + B = (sf | esigned).view(np.int64) + return B + + +def bitpaircount_u1(a, b): + assert a.dtype == "u1" + assert b.dtype == "u1" + unpack_a = np.unpackbits(a.flatten()).astype("u1") + unpack_b = np.unpackbits(b.flatten()).astype("u1") + + index = ((unpack_a << 1) | unpack_b).reshape(-1, 8) + + selection = np.array([0, 1, 2, 3], dtype="u1") + sel = np.where((index[..., np.newaxis]) == selection, True, False) + return sel.sum(axis=0).reshape([8, 2, 2]) + + +def bitpaircount(a, b): + assert a.dtype.kind == "u" + assert b.dtype.kind == "u" + nbytes = max(a.dtype.itemsize, b.dtype.itemsize) + + a, b = np.broadcast_arrays(a, b) + + bytewise_counts = [] + for i in range(nbytes): + s = (nbytes - 1 - i) * 8 + bitc = bitpaircount_u1((a >> s).astype("u1"), (b >> s).astype("u1")) + bytewise_counts.append(bitc) + return np.concatenate(bytewise_counts, axis=0) + + +def mutual_information(a, b, base=2): + """Calculate the mutual information between two arrays. + """ + size = np.prod(np.broadcast_shapes(a.shape, b.shape)) + counts = bitpaircount(a, b) + + p = counts.astype("float") / size + p = np.ma.masked_equal(p, 0) + pr = p.sum(axis=-1)[..., np.newaxis] + ps = p.sum(axis=-2)[..., np.newaxis, :] + mutual_info = (p * np.ma.log(p / (pr * ps))).sum(axis=(-1, -2)) / np.log(base) + return mutual_info + + +def bitinformation(a, axis=0): + """Get the information content of each bit in the array. + + Parameters + ---------- + a : array + Array to calculate the bit information. + axis : int + Axis along which to calculate the bit information. + + Returns + ------- + info_per_bit : array + """ + sa = tuple(slice(0, -1) if i == axis else slice(None) for i in range(len(a.shape))) + sb = tuple( + slice(1, None) if i == axis else slice(None) for i in range(len(a.shape)) + ) + return mutual_information(a[sa], a[sb]) + + +def get_keepbits(info_per_bit, inflevel=0.99): + """Get the number of mantissa bits to keep. + + Parameters + ---------- + info_per_bit : array + Information content of each bit from `get_bitinformation`. + + inflevel : float + Level of information that shall be preserved. + + Returns + ------- + keepbits : int + Number of mantissa bits to keep + + """ + if (inflevel < 0) or (inflevel > 1.0): + raise ValueError("Please provide `inflevel` from interval [0.,1.]") + + cdf = _cdf_from_info_per_bit(info_per_bit) + bitdim_non_mantissa_bits = NMBITS[len(info_per_bit)] + keepmantissabits = ( + (cdf > inflevel).argmax() + 1 - bitdim_non_mantissa_bits + ) + + return keepmantissabits + + +def _cdf_from_info_per_bit(info_per_bit): + """Convert info_per_bit to cumulative distribution function""" + tol = info_per_bit[-4:].max() * 1.5 + info_per_bit[info_per_bit < tol] = 0 + cdf = info_per_bit.cumsum() + return cdf / cdf[-1] diff --git a/numcodecs/bitround.py b/numcodecs/bitround.py index 767e5e43..c3a626b4 100644 --- a/numcodecs/bitround.py +++ b/numcodecs/bitround.py @@ -54,14 +54,20 @@ def encode(self, buf): raise TypeError("Only float arrays (16-64bit) can be bit-rounded") bits = max_bits[str(a.dtype)] # cast float to int type of same width (preserve endianness) - a_int_dtype = np.dtype(a.dtype.str.replace("f", "i")) - all_set = np.array(-1, dtype=a_int_dtype) if self.keepbits == bits: return a if self.keepbits > bits: raise ValueError("Keepbits too large for given dtype") - b = a.view(a_int_dtype) - maskbits = bits - self.keepbits + + return self._bitround(a, self.keepbits) + + @staticmethod + def _bitround(buf, keepbits): + bits = max_bits[str(buf.dtype)] + a_int_dtype = np.dtype(buf.dtype.str.replace("f", "i")) + all_set = np.array(-1, dtype=a_int_dtype) + b = buf.view(a_int_dtype) + maskbits = bits - keepbits mask = (all_set >> maskbits) << maskbits half_quantum1 = (1 << (maskbits - 1)) - 1 b += ((b >> maskbits) & 1) + half_quantum1 From e35221b9e5f9516c025e10d0e6f4ecc19e66439f Mon Sep 17 00:00:00 2001 From: thodson Date: Fri, 2 Feb 2024 11:27:14 -0600 Subject: [PATCH 02/15] Fix PEP 8 issues --- numcodecs/bitinfo.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/numcodecs/bitinfo.py b/numcodecs/bitinfo.py index 51049c84..a999a00f 100644 --- a/numcodecs/bitinfo.py +++ b/numcodecs/bitinfo.py @@ -9,6 +9,7 @@ NMBITS = {64: 12, 32: 9, 16: 6} # number of non mantissa bits for given dtype + class BitInfo(BitRound): """Floating-point bit information codec @@ -233,7 +234,7 @@ def get_keepbits(info_per_bit, inflevel=0.99): ---------- info_per_bit : array Information content of each bit from `get_bitinformation`. - + inflevel : float Level of information that shall be preserved. @@ -241,10 +242,10 @@ def get_keepbits(info_per_bit, inflevel=0.99): ------- keepbits : int Number of mantissa bits to keep - + """ if (inflevel < 0) or (inflevel > 1.0): - raise ValueError("Please provide `inflevel` from interval [0.,1.]") + raise ValueError("Please provide `inflevel` from interval [0.,1.]") cdf = _cdf_from_info_per_bit(info_per_bit) bitdim_non_mantissa_bits = NMBITS[len(info_per_bit)] From a97f7e89fc989c2cb5adfd6afc589ac4e84b3995 Mon Sep 17 00:00:00 2001 From: thodson Date: Fri, 2 Feb 2024 12:27:43 -0600 Subject: [PATCH 03/15] Fixing bugs --- numcodecs/__init__.py | 3 +++ numcodecs/bitinfo.py | 4 +++- numcodecs/bitround.py | 6 +++--- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/numcodecs/__init__.py b/numcodecs/__init__.py index 47c2c616..2c50a30a 100644 --- a/numcodecs/__init__.py +++ b/numcodecs/__init__.py @@ -109,6 +109,9 @@ register_codec(Shuffle) +from numcodecs.bitinfo import BitInfo +register_codec(BitInfo) + from numcodecs.bitround import BitRound register_codec(BitRound) diff --git a/numcodecs/bitinfo.py b/numcodecs/bitinfo.py index a999a00f..0bdddb3d 100644 --- a/numcodecs/bitinfo.py +++ b/numcodecs/bitinfo.py @@ -50,6 +50,8 @@ def encode(self, buf): compressible. """ a = ensure_ndarray_like(buf) + dtype = a.dtype + if not a.dtype.kind == "f" or a.dtype.itemsize > 8: raise TypeError("Only float arrays (16-64bit) can be bit-rounded") @@ -70,7 +72,7 @@ def encode(self, buf): keepbits = max(keepbits) - return BitRound._bitround(a, keepbits) + return BitRound._bitround(a, keepbits, dtype) def exponent_bias(dtype): diff --git a/numcodecs/bitround.py b/numcodecs/bitround.py index c3a626b4..a6000e58 100644 --- a/numcodecs/bitround.py +++ b/numcodecs/bitround.py @@ -59,11 +59,11 @@ def encode(self, buf): if self.keepbits > bits: raise ValueError("Keepbits too large for given dtype") - return self._bitround(a, self.keepbits) + return self._bitround(a, self.keepbits, a.dtype) @staticmethod - def _bitround(buf, keepbits): - bits = max_bits[str(buf.dtype)] + def _bitround(buf, keepbits, dtype): + bits = max_bits[str(dtype)] a_int_dtype = np.dtype(buf.dtype.str.replace("f", "i")) all_set = np.array(-1, dtype=a_int_dtype) b = buf.view(a_int_dtype) From f8841131132e570ed7b6e00177086855bd0aef58 Mon Sep 17 00:00:00 2001 From: thodson Date: Fri, 2 Feb 2024 12:41:17 -0600 Subject: [PATCH 04/15] Bugfix; now it works --- numcodecs/bitinfo.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/numcodecs/bitinfo.py b/numcodecs/bitinfo.py index 0bdddb3d..2b620d28 100644 --- a/numcodecs/bitinfo.py +++ b/numcodecs/bitinfo.py @@ -56,7 +56,7 @@ def encode(self, buf): raise TypeError("Only float arrays (16-64bit) can be bit-rounded") if self.axes is None: - axes = range(a.ndim) + self.axes = range(a.ndim) itemsize = a.dtype.itemsize astype = f"u{itemsize}" @@ -66,7 +66,7 @@ def encode(self, buf): a = a.astype(astype) keepbits = [] - for ax in axes: + for ax in self.axes: info_per_bit = bitinformation(a, axis=ax) keepbits.append(get_keepbits(info_per_bit, self.inflevel)) From c04a6e3f3a8eb59ffdc86550806dcea4e8755c3d Mon Sep 17 00:00:00 2001 From: thodson Date: Fri, 2 Feb 2024 22:03:27 -0600 Subject: [PATCH 05/15] Fix Codec.__init__ --- numcodecs/bitinfo.py | 41 ++++++++++++++++++++++++----------------- numcodecs/bitround.py | 26 +++++++++++++------------- 2 files changed, 37 insertions(+), 30 deletions(-) diff --git a/numcodecs/bitinfo.py b/numcodecs/bitinfo.py index 2b620d28..49459e13 100644 --- a/numcodecs/bitinfo.py +++ b/numcodecs/bitinfo.py @@ -23,11 +23,9 @@ class BitInfo(BitRound): Parameters ---------- - inflevel: float - The number of bits of the mantissa to keep. The range allowed - depends on the dtype input data. If keepbits is - equal to the maximum allowed for the data type, this is equivalent - to no transform. + info_level: float + The level of information to preserve in the data. The value should be + between 0. and 1.0. Higher values preserve more information. axes: int or list of int, optional Axes along which to calculate the bit information. If None, all axes @@ -36,13 +34,22 @@ class BitInfo(BitRound): codec_id = 'bitinfo' - def __init__(self, inflevel: float, axes=None): - if (inflevel < 0) or (inflevel > 1.0): - raise ValueError("Please provide `inflevel` from interval [0.,1.]") + def __init__(self, info_level: float, axes=None): + if (info_level < 0) or (info_level > 1.0): + raise ValueError("Please provide `info_level` from interval [0.,1.]") - self.inflevel = inflevel + elif axes is not None and not isinstance(axes, list): + if int(axes) != axes: + raise ValueError("axis must be an integer or a list of integers.") + axes = [axes] + + elif isinstance(axes, list) and not all(int(ax) == ax for ax in axes): + raise ValueError("axis must be an integer or a list of integers.") + + self.info_level = info_level self.axes = axes + def encode(self, buf): """Create int array by rounding floating-point data @@ -68,11 +75,11 @@ def encode(self, buf): for ax in self.axes: info_per_bit = bitinformation(a, axis=ax) - keepbits.append(get_keepbits(info_per_bit, self.inflevel)) + keepbits.append(get_keepbits(info_per_bit, self.info_level)) keepbits = max(keepbits) - return BitRound._bitround(a, keepbits, dtype) + return BitRound.bitround(buf, keepbits, dtype) def exponent_bias(dtype): @@ -117,12 +124,12 @@ def signed_exponent(A): Parameters ---------- - A : :py:class:`numpy.array` + a : array Array to transform Returns ------- - B : :py:class:`numpy.array` + array Example ------- @@ -162,8 +169,7 @@ def signed_exponent(A): eabs = np.uint64(eabs) esign = np.uint64(esign) esigned = esign | (eabs << sbits) - B = (sf | esigned).view(np.int64) - return B + return (sf | esigned).view(np.int64) def bitpaircount_u1(a, b): @@ -260,7 +266,8 @@ def get_keepbits(info_per_bit, inflevel=0.99): def _cdf_from_info_per_bit(info_per_bit): """Convert info_per_bit to cumulative distribution function""" - tol = info_per_bit[-4:].max() * 1.5 - info_per_bit[info_per_bit < tol] = 0 + # TODO this threshold isn't working yet + #tol = info_per_bit[-4:].max() * 1.5 + #info_per_bit[info_per_bit < tol] = 0 cdf = info_per_bit.cumsum() return cdf / cdf[-1] diff --git a/numcodecs/bitround.py b/numcodecs/bitround.py index a6000e58..ceed97d7 100644 --- a/numcodecs/bitround.py +++ b/numcodecs/bitround.py @@ -59,10 +59,21 @@ def encode(self, buf): if self.keepbits > bits: raise ValueError("Keepbits too large for given dtype") - return self._bitround(a, self.keepbits, a.dtype) + return self.bitround(a, self.keepbits, a.dtype) + + def decode(self, buf, out=None): + """Remake floats from ints + + As with ``encode``, preserves itemsize. + """ + buf = ensure_ndarray_like(buf) + # Cast back from `int` to `float` type (noop if a `float`ing type buffer is provided) + dt = np.dtype(buf.dtype.str.replace("i", "f")) + data = buf.view(dt) + return ndarray_copy(data, out) @staticmethod - def _bitround(buf, keepbits, dtype): + def bitround(buf, keepbits, dtype): bits = max_bits[str(dtype)] a_int_dtype = np.dtype(buf.dtype.str.replace("f", "i")) all_set = np.array(-1, dtype=a_int_dtype) @@ -73,14 +84,3 @@ def _bitround(buf, keepbits, dtype): b += ((b >> maskbits) & 1) + half_quantum1 b &= mask return b - - def decode(self, buf, out=None): - """Remake floats from ints - - As with ``encode``, preserves itemsize. - """ - buf = ensure_ndarray_like(buf) - # Cast back from `int` to `float` type (noop if a `float`ing type buffer is provided) - dt = np.dtype(buf.dtype.str.replace("i", "f")) - data = buf.view(dt) - return ndarray_copy(data, out) From 03128f6c2f25e0132c071eb50a396d56f6b41f96 Mon Sep 17 00:00:00 2001 From: thodson Date: Sat, 3 Feb 2024 18:42:48 -0600 Subject: [PATCH 06/15] Adjust tolerance but still needs fix --- numcodecs/bitinfo.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/numcodecs/bitinfo.py b/numcodecs/bitinfo.py index 49459e13..cd64058f 100644 --- a/numcodecs/bitinfo.py +++ b/numcodecs/bitinfo.py @@ -266,8 +266,8 @@ def get_keepbits(info_per_bit, inflevel=0.99): def _cdf_from_info_per_bit(info_per_bit): """Convert info_per_bit to cumulative distribution function""" - # TODO this threshold isn't working yet - #tol = info_per_bit[-4:].max() * 1.5 - #info_per_bit[info_per_bit < tol] = 0 + # TODO this threshold doesn't match implementation in remove_insignificant.jl + tol = info_per_bit[-4:].max() * 1.1 # reduced from 1.5 + info_per_bit[info_per_bit < tol] = 0 cdf = info_per_bit.cumsum() return cdf / cdf[-1] From 11ef603a2f06f922418f97ba3c95151d31f0af35 Mon Sep 17 00:00:00 2001 From: thodson Date: Mon, 5 Feb 2024 11:05:29 -0600 Subject: [PATCH 07/15] Add basic tests --- docs/bitinfo.rst | 10 +++++ numcodecs/bitinfo.py | 9 ++++ numcodecs/bitround.py | 18 +++++++- numcodecs/tests/test_bitinfo.py | 73 +++++++++++++++++++++++++++++++++ 4 files changed, 109 insertions(+), 1 deletion(-) create mode 100644 docs/bitinfo.rst create mode 100644 numcodecs/tests/test_bitinfo.py diff --git a/docs/bitinfo.rst b/docs/bitinfo.rst new file mode 100644 index 00000000..ed925056 --- /dev/null +++ b/docs/bitinfo.rst @@ -0,0 +1,10 @@ +PCodec +====== + +.. automodule:: numcodecs.bitinfo + +.. autoclass:: BitInfo + + .. autoattribute:: codec_id + .. automethod:: encode + .. automethod:: decode diff --git a/numcodecs/bitinfo.py b/numcodecs/bitinfo.py index cd64058f..8783a71c 100644 --- a/numcodecs/bitinfo.py +++ b/numcodecs/bitinfo.py @@ -115,6 +115,8 @@ def exponent_mask(dtype): mask = 0x7F80_0000 elif dtype == np.float64: mask = 0x7FF0_0000_0000_0000 + else: + raise ValueError(f"Unsupported dtype {dtype}") return mask @@ -175,6 +177,7 @@ def signed_exponent(A): def bitpaircount_u1(a, b): assert a.dtype == "u1" assert b.dtype == "u1" + unpack_a = np.unpackbits(a.flatten()).astype("u1") unpack_b = np.unpackbits(b.flatten()).astype("u1") @@ -188,6 +191,7 @@ def bitpaircount_u1(a, b): def bitpaircount(a, b): assert a.dtype.kind == "u" assert b.dtype.kind == "u" + nbytes = max(a.dtype.itemsize, b.dtype.itemsize) a, b = np.broadcast_arrays(a, b) @@ -203,6 +207,9 @@ def bitpaircount(a, b): def mutual_information(a, b, base=2): """Calculate the mutual information between two arrays. """ + assert a.dtype == b.dtype + assert a.dtype.kind == "u" + size = np.prod(np.broadcast_shapes(a.shape, b.shape)) counts = bitpaircount(a, b) @@ -228,6 +235,8 @@ def bitinformation(a, axis=0): ------- info_per_bit : array """ + assert a.dtype.kind == "u" + sa = tuple(slice(0, -1) if i == axis else slice(None) for i in range(len(a.shape))) sb = tuple( slice(1, None) if i == axis else slice(None) for i in range(len(a.shape)) diff --git a/numcodecs/bitround.py b/numcodecs/bitround.py index ceed97d7..117d7467 100644 --- a/numcodecs/bitround.py +++ b/numcodecs/bitround.py @@ -73,7 +73,23 @@ def decode(self, buf, out=None): return ndarray_copy(data, out) @staticmethod - def bitround(buf, keepbits, dtype): + def bitround(buf, keepbits: int, dtype): + """Drop bits from the mantissa of a floating point array + + Parameters + ---------- + buf: ndarray + The input array + keepbits: int + The number of bits to keep + dtype: dtype + The dtype of the input array + + Returns + ------- + ndarray + The bitrounded array transformed to an integer type + """ bits = max_bits[str(dtype)] a_int_dtype = np.dtype(buf.dtype.str.replace("f", "i")) all_set = np.array(-1, dtype=a_int_dtype) diff --git a/numcodecs/tests/test_bitinfo.py b/numcodecs/tests/test_bitinfo.py new file mode 100644 index 00000000..2e6d33ac --- /dev/null +++ b/numcodecs/tests/test_bitinfo.py @@ -0,0 +1,73 @@ +import numpy as np + +import pytest + +from numcodecs.bitinfo import BitInfo, exponent_bias, mutual_information + +def test_bitinfo_initialization(): + bitinfo = BitInfo(0.5) + assert bitinfo.info_level == 0.5 + assert bitinfo.axes is None + + bitinfo = BitInfo(0.5, axes=1) + assert bitinfo.axes == [1] + + bitinfo = BitInfo(0.5, axes=[1, 2]) + assert bitinfo.axes == [1, 2] + + with pytest.raises(ValueError): + BitInfo(-0.1) + + with pytest.raises(ValueError): + BitInfo(1.1) + + with pytest.raises(ValueError): + BitInfo(0.5, axes=1.5) + + with pytest.raises(ValueError): + BitInfo(0.5, axes=[1, 1.5]) + + +def test_bitinfo_encode(): + bitinfo = BitInfo(info_level=0.5) + a = np.array([1.0, 2.0, 3.0], dtype="float32") + encoded = bitinfo.encode(a) + decoded = bitinfo.decode(encoded) + assert decoded.dtype == a.dtype + + +def test_bitinfo_encode_errors(): + bitinfo = BitInfo(0.5) + a = np.array([1, 2, 3], dtype="int32") + with pytest.raises(TypeError): + bitinfo.encode(a) + + a = np.array([1.0, 2.0, 3.0], dtype="float128") + with pytest.raises(TypeError): + bitinfo.encode(a) + + +def test_exponent_bias(): + assert exponent_bias("f2") == 15 + assert exponent_bias("f4") == 127 + assert exponent_bias("f8") == 1023 + + with pytest.raises(ValueError): + exponent_bias("int32") + + +def test_mutual_information(): + """ Test mutual information calculation + + Tests for changes to the mutual_information + but not the correcteness of the original. + """ + a = np.arange(10.0, dtype='float32') + b = a + 1000 + c = a[::-1].copy() + dt = np.dtype('uint32') + a,b,c = map(lambda x: x.view(dt), [a,b,c]) + + assert mutual_information(a, a).sum() == 7.020411549771797 + assert mutual_information(a, b).sum() == 0.0 + assert mutual_information(a, c).sum() == 0.6545015579460758 From f2e23946f52649a6fa854509b590cdc894d65684 Mon Sep 17 00:00:00 2001 From: thodson Date: Tue, 6 Feb 2024 21:12:59 -0600 Subject: [PATCH 08/15] Linting --- numcodecs/bitinfo.py | 8 +++----- numcodecs/tests/test_bitinfo.py | 3 ++- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/numcodecs/bitinfo.py b/numcodecs/bitinfo.py index 8783a71c..ab928ab9 100644 --- a/numcodecs/bitinfo.py +++ b/numcodecs/bitinfo.py @@ -49,7 +49,6 @@ def __init__(self, info_level: float, axes=None): self.info_level = info_level self.axes = axes - def encode(self, buf): """Create int array by rounding floating-point data @@ -251,7 +250,7 @@ def get_keepbits(info_per_bit, inflevel=0.99): ---------- info_per_bit : array Information content of each bit from `get_bitinformation`. - + inflevel : float Level of information that shall be preserved. @@ -259,7 +258,7 @@ def get_keepbits(info_per_bit, inflevel=0.99): ------- keepbits : int Number of mantissa bits to keep - + """ if (inflevel < 0) or (inflevel > 1.0): raise ValueError("Please provide `inflevel` from interval [0.,1.]") @@ -275,8 +274,7 @@ def get_keepbits(info_per_bit, inflevel=0.99): def _cdf_from_info_per_bit(info_per_bit): """Convert info_per_bit to cumulative distribution function""" - # TODO this threshold doesn't match implementation in remove_insignificant.jl - tol = info_per_bit[-4:].max() * 1.1 # reduced from 1.5 + tol = info_per_bit[-4:].max() * 1.1 # reduced from 1.5 info_per_bit[info_per_bit < tol] = 0 cdf = info_per_bit.cumsum() return cdf / cdf[-1] diff --git a/numcodecs/tests/test_bitinfo.py b/numcodecs/tests/test_bitinfo.py index 2e6d33ac..485f7145 100644 --- a/numcodecs/tests/test_bitinfo.py +++ b/numcodecs/tests/test_bitinfo.py @@ -4,6 +4,7 @@ from numcodecs.bitinfo import BitInfo, exponent_bias, mutual_information + def test_bitinfo_initialization(): bitinfo = BitInfo(0.5) assert bitinfo.info_level == 0.5 @@ -66,7 +67,7 @@ def test_mutual_information(): b = a + 1000 c = a[::-1].copy() dt = np.dtype('uint32') - a,b,c = map(lambda x: x.view(dt), [a,b,c]) + a, b, c = map(lambda x: x.view(dt), [a, b, c]) assert mutual_information(a, a).sum() == 7.020411549771797 assert mutual_information(a, b).sum() == 0.0 From 6b6fa4f5b3dcb9e315a020d2c6c3eba05e13666a Mon Sep 17 00:00:00 2001 From: Timothy Hodson <34148978+thodson-usgs@users.noreply.github.com> Date: Fri, 1 Mar 2024 13:25:54 -0600 Subject: [PATCH 09/15] Revert info_per_bit tolerance to 1.5 The xbitinfo implementation uses a tolerance factor of 1.5. I lowered the tolerance to 1.1, because I was getting poor results with my test data, but it was pointed out that the problem was that my test data were quantized. Quantization is an open issue with xbitinfo too, and we should address it first there, then patch the codec. --- numcodecs/bitinfo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numcodecs/bitinfo.py b/numcodecs/bitinfo.py index ab928ab9..9b0b6288 100644 --- a/numcodecs/bitinfo.py +++ b/numcodecs/bitinfo.py @@ -274,7 +274,7 @@ def get_keepbits(info_per_bit, inflevel=0.99): def _cdf_from_info_per_bit(info_per_bit): """Convert info_per_bit to cumulative distribution function""" - tol = info_per_bit[-4:].max() * 1.1 # reduced from 1.5 + tol = info_per_bit[-4:].max() * 1.5 info_per_bit[info_per_bit < tol] = 0 cdf = info_per_bit.cumsum() return cdf / cdf[-1] From 673d86b3cef98a0f946f8635074f1d9feb11c7a1 Mon Sep 17 00:00:00 2001 From: thodson Date: Mon, 22 Apr 2024 22:22:04 -0500 Subject: [PATCH 10/15] Remove float128 test --- numcodecs/tests/test_bitinfo.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/numcodecs/tests/test_bitinfo.py b/numcodecs/tests/test_bitinfo.py index 485f7145..aba5b34e 100644 --- a/numcodecs/tests/test_bitinfo.py +++ b/numcodecs/tests/test_bitinfo.py @@ -43,10 +43,6 @@ def test_bitinfo_encode_errors(): with pytest.raises(TypeError): bitinfo.encode(a) - a = np.array([1.0, 2.0, 3.0], dtype="float128") - with pytest.raises(TypeError): - bitinfo.encode(a) - def test_exponent_bias(): assert exponent_bias("f2") == 15 @@ -58,7 +54,7 @@ def test_exponent_bias(): def test_mutual_information(): - """ Test mutual information calculation + """Test mutual information calculation Tests for changes to the mutual_information but not the correcteness of the original. From 157c57c3248de85f9c889af4373d33808d1bde2f Mon Sep 17 00:00:00 2001 From: thodson Date: Tue, 30 Apr 2024 12:13:52 -0500 Subject: [PATCH 11/15] Add usage example --- numcodecs/bitinfo.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/numcodecs/bitinfo.py b/numcodecs/bitinfo.py index 9b0b6288..7c1a92aa 100644 --- a/numcodecs/bitinfo.py +++ b/numcodecs/bitinfo.py @@ -30,6 +30,19 @@ class BitInfo(BitRound): axes: int or list of int, optional Axes along which to calculate the bit information. If None, all axes are used. + + Examples + -------- + >>> import xarray as xr + >>> ds = xr.tutorial.open_dataset("air_temperature") + >>> # Note these data have already undergone lossy compression, + >>> # which should not be combined with bitinformation in practice + + >>> from numcodecs import Blosc, BitInfo + >>> compressor = Blosc(cname="zstd", clevel=3) + >>> filters = [BitInfo(info_level=0.99)] + >>> encoding = {"air": {"compressor": compressor, "filters": filters}} + >>> ds.to_zarr('xbit.zarr', mode="w", encoding=encoding) """ codec_id = 'bitinfo' @@ -204,8 +217,7 @@ def bitpaircount(a, b): def mutual_information(a, b, base=2): - """Calculate the mutual information between two arrays. - """ + """Calculate the mutual information between two arrays.""" assert a.dtype == b.dtype assert a.dtype.kind == "u" @@ -265,9 +277,7 @@ def get_keepbits(info_per_bit, inflevel=0.99): cdf = _cdf_from_info_per_bit(info_per_bit) bitdim_non_mantissa_bits = NMBITS[len(info_per_bit)] - keepmantissabits = ( - (cdf > inflevel).argmax() + 1 - bitdim_non_mantissa_bits - ) + keepmantissabits = (cdf > inflevel).argmax() + 1 - bitdim_non_mantissa_bits return keepmantissabits From 45ab3c4e9b7a08628283321e4c05c719cca7df04 Mon Sep 17 00:00:00 2001 From: thodson Date: Tue, 30 Apr 2024 12:44:50 -0500 Subject: [PATCH 12/15] Add xarray to test_extra dependencies --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 3fb3ad0d..aedc7dea 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,6 +59,7 @@ test = [ ] test_extras = [ "importlib_metadata", + "xarray", ] msgpack = [ "msgpack", From 78f93e3b84ddeba5627a9c261cf961f7832b9284 Mon Sep 17 00:00:00 2001 From: thodson Date: Tue, 30 Apr 2024 16:16:41 -0500 Subject: [PATCH 13/15] Add xarray dependencies for doctest --- .github/workflows/ci-osx.yaml | 75 +++++++++++++++++++++++++++++++ .github/workflows/ci-windows.yaml | 67 +++++++++++++++++++++++++++ numcodecs/bitinfo.py | 12 +++-- pyproject.toml | 5 +++ 4 files changed, 155 insertions(+), 4 deletions(-) create mode 100644 .github/workflows/ci-osx.yaml create mode 100644 .github/workflows/ci-windows.yaml diff --git a/.github/workflows/ci-osx.yaml b/.github/workflows/ci-osx.yaml new file mode 100644 index 00000000..52394c2b --- /dev/null +++ b/.github/workflows/ci-osx.yaml @@ -0,0 +1,75 @@ +name: OSX CI + +on: [push, pull_request] + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + build: + runs-on: macos-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] + + steps: + - name: Checkout source + uses: actions/checkout@v4 + with: + submodules: recursive + + - name: Set up Conda + uses: conda-incubator/setup-miniconda@v3.0.4 + with: + channels: conda-forge + miniforge-version: latest + python-version: ${{ matrix.python-version }} + + - name: Show info about `base` environment + shell: "bash -l {0}" + run: | + conda info + conda config --show-sources + conda list --show-channel-urls + + - name: Set up `env` + shell: "bash -l {0}" + run: > + conda create -n env + c-compiler cxx-compiler 'clang>=12.0.1' + python=${{matrix.python-version}} wheel pip + + - name: Show info about `env` environment + shell: "bash -l {0}" + run: | + conda list --show-channel-urls -n env + + - name: Install numcodecs + shell: "bash -l {0}" + run: | + conda activate env + export DISABLE_NUMCODECS_AVX2="" + python -m pip install -v -e .[test,test_extras,doctest,msgpack,zfpy,pcodec] + + - name: List installed packages + shell: "bash -l {0}" + run: | + conda activate env + python -m pip list + + - name: Run tests + shell: "bash -l {0}" + run: | + conda activate env + pytest -v + + - uses: codecov/codecov-action@v3 + with: + #token: ${{ secrets.CODECOV_TOKEN }} # not required for public repos + #files: ./coverage1.xml,./coverage2.xml # optional + #flags: unittests # optional + #name: codecov-umbrella # optional + #fail_ci_if_error: true # optional (default = false) + verbose: true # optional (default = false) diff --git a/.github/workflows/ci-windows.yaml b/.github/workflows/ci-windows.yaml new file mode 100644 index 00000000..f3d18884 --- /dev/null +++ b/.github/workflows/ci-windows.yaml @@ -0,0 +1,67 @@ +name: Windows CI + +on: [push, pull_request] + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + build: + runs-on: windows-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] + + steps: + - name: Checkout source + uses: actions/checkout@v4 + with: + submodules: recursive + + - name: Set up Conda + uses: conda-incubator/setup-miniconda@v3.0.4 + with: + channels: conda-forge + miniforge-version: latest + python-version: ${{ matrix.python-version }} + + - name: Set up `env` + shell: "bash -l {0}" + run: > + conda create -n env + c-compiler cxx-compiler + python=${{matrix.python-version}} wheel pip + + - name: Show info about `env` environment + shell: "bash -l {0}" + run: | + conda list --show-channel-urls -n env + + - name: Install numcodecs + shell: "bash -l {0}" + run: | + conda activate env + python -m pip install -v -e .[test,test_extras,doctest,msgpack,zfpy,pcodec] + + - name: List installed packages + shell: "bash -l {0}" + run: | + conda activate env + python -m pip list + + - name: Run tests + shell: "bash -l {0}" + run: | + conda activate env + pytest -v + + - uses: codecov/codecov-action@v3 + with: + #token: ${{ secrets.CODECOV_TOKEN }} # not required for public repos + #files: ./coverage1.xml,./coverage2.xml # optional + #flags: unittests # optional + #name: codecov-umbrella # optional + #fail_ci_if_error: true # optional (default = false) + verbose: true # optional (default = false) diff --git a/numcodecs/bitinfo.py b/numcodecs/bitinfo.py index 7c1a92aa..3d46dd4f 100644 --- a/numcodecs/bitinfo.py +++ b/numcodecs/bitinfo.py @@ -33,16 +33,20 @@ class BitInfo(BitRound): Examples -------- + This example applies the BitInfo codec to an xarray dataset and + writes the data out using xarray's `to_zarr` method. Using this pattern, the + information content is computed chunkwise, which is recommended for + datasets with variable information content. Note that these data have + been quantized creating erroneous results, which is apparent in + the output. Do not use with quantized data in practice. + >>> import xarray as xr >>> ds = xr.tutorial.open_dataset("air_temperature") - >>> # Note these data have already undergone lossy compression, - >>> # which should not be combined with bitinformation in practice - >>> from numcodecs import Blosc, BitInfo >>> compressor = Blosc(cname="zstd", clevel=3) >>> filters = [BitInfo(info_level=0.99)] >>> encoding = {"air": {"compressor": compressor, "filters": filters}} - >>> ds.to_zarr('xbit.zarr', mode="w", encoding=encoding) + >>> _ = ds.to_zarr('xbit.zarr', mode="w", encoding=encoding) """ codec_id = 'bitinfo' diff --git a/pyproject.toml b/pyproject.toml index aedc7dea..d19018b4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,7 +59,12 @@ test = [ ] test_extras = [ "importlib_metadata", +] +doctest = [ "xarray", + "pooch", + "netCDF4", + "zarr", ] msgpack = [ "msgpack", From 385c40717784c7aaaecec05ce043a5147fff9072 Mon Sep 17 00:00:00 2001 From: thodson Date: Thu, 2 May 2024 15:14:23 -0500 Subject: [PATCH 14/15] Remove usage example from doctest --- .github/workflows/ci-osx.yaml | 10 +++++----- .github/workflows/ci-windows.yaml | 10 +++++----- .github/workflows/ci.yaml | 9 +++++++++ numcodecs/bitinfo.py | 14 +++++++------- pyproject.toml | 6 ------ 5 files changed, 26 insertions(+), 23 deletions(-) diff --git a/.github/workflows/ci-osx.yaml b/.github/workflows/ci-osx.yaml index 52394c2b..9b30d587 100644 --- a/.github/workflows/ci-osx.yaml +++ b/.github/workflows/ci-osx.yaml @@ -51,7 +51,7 @@ jobs: run: | conda activate env export DISABLE_NUMCODECS_AVX2="" - python -m pip install -v -e .[test,test_extras,doctest,msgpack,zfpy,pcodec] + python -m pip install -v -e .[test,test_extras,msgpack,zfpy,pcodec] - name: List installed packages shell: "bash -l {0}" @@ -68,8 +68,8 @@ jobs: - uses: codecov/codecov-action@v3 with: #token: ${{ secrets.CODECOV_TOKEN }} # not required for public repos - #files: ./coverage1.xml,./coverage2.xml # optional - #flags: unittests # optional - #name: codecov-umbrella # optional - #fail_ci_if_error: true # optional (default = false) + #files: ./coverage1.xml,./coverage2.xml # optional + #flags: unittests # optional + #name: codecov-umbrella # optional + #fail_ci_if_error: true # optional (default = false) verbose: true # optional (default = false) diff --git a/.github/workflows/ci-windows.yaml b/.github/workflows/ci-windows.yaml index f3d18884..397be043 100644 --- a/.github/workflows/ci-windows.yaml +++ b/.github/workflows/ci-windows.yaml @@ -43,7 +43,7 @@ jobs: shell: "bash -l {0}" run: | conda activate env - python -m pip install -v -e .[test,test_extras,doctest,msgpack,zfpy,pcodec] + python -m pip install -v -e .[test,test_extras,msgpack,zfpy,pcodec] - name: List installed packages shell: "bash -l {0}" @@ -60,8 +60,8 @@ jobs: - uses: codecov/codecov-action@v3 with: #token: ${{ secrets.CODECOV_TOKEN }} # not required for public repos - #files: ./coverage1.xml,./coverage2.xml # optional - #flags: unittests # optional - #name: codecov-umbrella # optional - #fail_ci_if_error: true # optional (default = false) + #files: ./coverage1.xml,./coverage2.xml # optional + #flags: unittests # optional + #name: codecov-umbrella # optional + #fail_ci_if_error: true # optional (default = false) verbose: true # optional (default = false) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 3c4aca34..1b6adcb8 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -85,5 +85,14 @@ jobs: - uses: codecov/codecov-action@v4 with: +<<<<<<< HEAD:.github/workflows/ci.yaml token: ${{ secrets.CODECOV_TOKEN }} verbose: true +======= + #token: ${{ secrets.CODECOV_TOKEN }} # not required for public repos + #files: ./coverage1.xml,./coverage2.xml # optional + #flags: unittests # optional + #name: codecov-umbrella # optional + #fail_ci_if_error: true # optional (default = false) + verbose: true # optional (default = false) +>>>>>>> ba38507 (Remove usage example from doctest):.github/workflows/ci-linux.yaml diff --git a/numcodecs/bitinfo.py b/numcodecs/bitinfo.py index 3d46dd4f..84dce586 100644 --- a/numcodecs/bitinfo.py +++ b/numcodecs/bitinfo.py @@ -40,13 +40,13 @@ class BitInfo(BitRound): been quantized creating erroneous results, which is apparent in the output. Do not use with quantized data in practice. - >>> import xarray as xr - >>> ds = xr.tutorial.open_dataset("air_temperature") - >>> from numcodecs import Blosc, BitInfo - >>> compressor = Blosc(cname="zstd", clevel=3) - >>> filters = [BitInfo(info_level=0.99)] - >>> encoding = {"air": {"compressor": compressor, "filters": filters}} - >>> _ = ds.to_zarr('xbit.zarr', mode="w", encoding=encoding) + import xarray as xr + ds = xr.tutorial.open_dataset("air_temperature") + from numcodecs import Blosc, BitInfo + compressor = Blosc(cname="zstd", clevel=3) + filters = [BitInfo(info_level=0.99)] + encoding = {"air": {"compressor": compressor, "filters": filters}} + ds.to_zarr('xbit.zarr', mode="w", encoding=encoding) """ codec_id = 'bitinfo' diff --git a/pyproject.toml b/pyproject.toml index d19018b4..3fb3ad0d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,12 +60,6 @@ test = [ test_extras = [ "importlib_metadata", ] -doctest = [ - "xarray", - "pooch", - "netCDF4", - "zarr", -] msgpack = [ "msgpack", ] From 50dcea9cd1dc90d6af9cc42841106340d1954508 Mon Sep 17 00:00:00 2001 From: thodson Date: Mon, 9 Sep 2024 09:27:48 -0500 Subject: [PATCH 15/15] Rebase and update release.rst --- docs/release.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/release.rst b/docs/release.rst index a3b47185..cf00e0e0 100644 --- a/docs/release.rst +++ b/docs/release.rst @@ -13,7 +13,8 @@ Unreleased Enhancements ~~~~~~~~~~~~ - +* Add bitinfo codec + By :user:`Timothy Hodson ` :pr:`503`. Fix ~~~