diff --git a/.github/workflows/pixi_test.yaml b/.github/workflows/pixi_test.yaml new file mode 100644 index 0000000..2311542 --- /dev/null +++ b/.github/workflows/pixi_test.yaml @@ -0,0 +1,31 @@ +name: Tests on GPU + +on: + push: + branches: + - main + tags: + - v*.*.* + pull_request: {} + +jobs: + test: + runs-on: [ubuntu-latest, gpu] + + steps: + - name: Check out the repository + uses: actions/checkout@v4 + + - name: Install pixi + run: | + curl -fsSL https://pixi.sh/install.sh | bash + echo "$HOME/.pixi/bin" >> $GITHUB_PATH + + - name: Show NVIDIA-smi + run: nvidia-smi + + - name: install pixi test environment + run: pixi install -e dev + + - name: run pytest + run: pixi run -e dev test diff --git a/bigwig_loader/intervals_to_values.py b/bigwig_loader/intervals_to_values.py index bc70ab6..c962145 100644 --- a/bigwig_loader/intervals_to_values.py +++ b/bigwig_loader/intervals_to_values.py @@ -1,5 +1,6 @@ import logging import math +from math import isnan from pathlib import Path import cupy as cp @@ -120,6 +121,7 @@ def intervals_to_values( array_start = cp.ascontiguousarray(array_start) array_end = cp.ascontiguousarray(array_end) array_value = cp.ascontiguousarray(array_value) + default_value_isnan = isnan(default_value) cuda_kernel( (grid_size,), @@ -137,6 +139,8 @@ def intervals_to_values( sequence_length, max_number_intervals, window_size, + default_value, + default_value_isnan, out, ), ) diff --git a/cuda_kernels/intervals_to_values.cu b/cuda_kernels/intervals_to_values.cu index c8173d2..2d1299d 100644 --- a/cuda_kernels/intervals_to_values.cu +++ b/cuda_kernels/intervals_to_values.cu @@ -1,3 +1,5 @@ +#include + extern "C" __global__ void intervals_to_values( const unsigned int* query_starts, @@ -12,6 +14,8 @@ void intervals_to_values( const int sequence_length, const int max_number_intervals, const int window_size, + const float default_value, + const bool default_value_isnan, float* out ) { @@ -49,7 +53,7 @@ void intervals_to_values( } } else { - int track_index = i / batch_size; +// int track_index = i / batch_size; int found_start_index = found_starts[i]; int found_end_index = found_ends[i]; @@ -59,6 +63,8 @@ void intervals_to_values( int cursor = found_start_index; int window_index = 0; float summation = 0.0f; + int valid_count = 0; + int reduced_dim = sequence_length / window_size; @@ -73,19 +79,34 @@ void intervals_to_values( int end_index = min(interval_end, query_end) - query_start; if (start_index >= window_end) { - out[i * reduced_dim + window_index] = summation / window_size; + if (default_value_isnan) { + out[i * reduced_dim + window_index] = valid_count > 0 ? summation / valid_count : CUDART_NAN_F; + } else { + summation = summation + (window_size - valid_count) * default_value; + out[i * reduced_dim + window_index] = summation / window_size; + } summation = 0.0f; + valid_count = 0; window_index += 1; continue; } int number = min(window_end, end_index) - max(window_start, start_index); - summation += number * track_values[cursor]; + if (number > 0) { + summation += number * track_values[cursor]; + valid_count += number; + } if (end_index >= window_end || cursor + 1 >= found_end_index) { - out[i * reduced_dim + window_index] = summation / window_size; - summation = 0.0f; + if (default_value_isnan) { + out[i * reduced_dim + window_index] = valid_count > 0 ? summation / valid_count : CUDART_NAN_F; + } else { + summation = summation + (window_size - valid_count) * default_value; + out[i * reduced_dim + window_index] = summation / window_size; + } + summation = 0.0f; + valid_count = 0; window_index += 1; } diff --git a/example_data/some_positions.tsv b/example_data/some_positions.tsv index a813904..52bec21 100644 --- a/example_data/some_positions.tsv +++ b/example_data/some_positions.tsv @@ -1,4 +1,5 @@ chr center +chr1 45298878 chr18 61036865 chr17 12174372 chr3 65857025 diff --git a/pixi.toml b/pixi.toml new file mode 100644 index 0000000..9928f1a --- /dev/null +++ b/pixi.toml @@ -0,0 +1,44 @@ +[workspace] +channels = ["rapidsai", "conda-forge", "nvidia"] +name = "bigwig-loader" +platforms = ["linux-64"] +version = "0.1.0" + +[tasks] +test = { cmd = 'pytest -v' } + +[dependencies] +python = ">=3.10" +pip = "*" +cython="*" +c-compiler = "*" +kvikio = ">=24.06.00" +cupy = ">=12.0.0" +numpy = ">=1.21" +pandas = "*" + +[pypi-dependencies] +torch = "*" +python-dotenv = "*" +pydantic = "*" +pydantic-settings = "*" +pyfaidx = "*" +natsort = "*" +setuptools-scm = "*" +ncls = "*" +bigwig_loader = { path = ".", editable = true } + +[feature.test.pypi-dependencies] +pytest = "*" +pytest-sugar = "*" +pytest-cov = "*" +pytest-mock = "*" +pyBigWig = "*" + +[feature.dev.pypi-dependencies] +mypy = "*" +pre-commit = "*" + +[environments] +test = {features = ["test"]} +dev = {features = ["test", "dev"]} diff --git a/tests/test_against_pybigwig.py b/tests/test_against_pybigwig.py index 99e412d..a74f0e7 100644 --- a/tests/test_against_pybigwig.py +++ b/tests/test_against_pybigwig.py @@ -26,8 +26,8 @@ def get_batch(self, chromosomes, starts, ends): def test_same_output(bigwig_path): - pybigwig_collection = PyBigWigCollection(bigwig_path, first_n_files=2) - collection = BigWigCollection(bigwig_path, first_n_files=2) + pybigwig_collection = PyBigWigCollection(bigwig_path, first_n_files=3) + collection = BigWigCollection(bigwig_path, first_n_files=3) df = pd.read_csv(config.example_positions, sep="\t") df = df[df["chr"].isin(collection.get_chromosomes_present_in_all_files())] @@ -49,3 +49,31 @@ def test_same_output(bigwig_path): print(this_batch[pybigwig_batch != this_batch]) print(pybigwig_batch[pybigwig_batch != this_batch]) assert (pybigwig_batch == this_batch).all() + + +def test_same_output_with_nans(bigwig_path): + pybigwig_collection = PyBigWigCollection(bigwig_path, first_n_files=3) + collection = BigWigCollection(bigwig_path, first_n_files=3) + + df = pd.read_csv(config.example_positions, sep="\t") + df = df[df["chr"].isin(collection.get_chromosomes_present_in_all_files())] + chromosomes, starts, ends = ( + list(df["chr"]), + list(df["center"] - 1000), + list(df["center"] + 1000), + ) + + pybigwig_batch = pybigwig_collection.get_batch(chromosomes, starts, ends) + + this_batch = collection.get_batch( + chromosomes, starts, ends, default_value=np.nan + ).get() + print("PyBigWig:") + print(pybigwig_batch) + print(type(this_batch), "shape:", pybigwig_batch.shape) + print("This Library:") + print(this_batch) + print(type(this_batch), "shape:", this_batch.shape) + print(this_batch[pybigwig_batch != this_batch]) + print(pybigwig_batch[pybigwig_batch != this_batch]) + assert np.allclose(pybigwig_batch, this_batch, equal_nan=True) diff --git a/tests/test_intervals_to_values.py b/tests/test_intervals_to_values.py index 9ef5215..8098b33 100644 --- a/tests/test_intervals_to_values.py +++ b/tests/test_intervals_to_values.py @@ -21,7 +21,8 @@ def test_throw_exception_when_queried_intervals_are_of_different_lengths() -> No ) -def test_get_values_from_intervals() -> None: +@pytest.mark.parametrize("default_value", [0.0, cp.nan]) +def test_get_values_from_intervals(default_value) -> None: """Probably most frequent situation.""" track_starts = cp.asarray([1, 3, 10, 12, 16], dtype=cp.int32) track_ends = cp.asarray([3, 10, 12, 16, 20], dtype=cp.int32) @@ -30,15 +31,22 @@ def test_get_values_from_intervals() -> None: query_ends = cp.asarray([17], dtype=cp.int32) reserved = cp.zeros((1, 15), dtype=cp.float32) values = intervals_to_values( - track_starts, track_ends, track_values, query_starts, query_ends, reserved + track_starts, + track_ends, + track_values, + query_starts, + query_ends, + default_value=default_value, + out=reserved, ) - assert ( - values - == cp.asarray([[20, 15, 15, 15, 15, 15, 15, 15, 30, 30, 40, 40, 40, 40, 50]]) - ).all() + expected = cp.asarray( + [[20, 15, 15, 15, 15, 15, 15, 15, 30, 30, 40, 40, 40, 40, 50]] + ) + assert cp.allclose(expected, values, equal_nan=True) -def test_get_values_from_intervals_edge_case_1() -> None: +@pytest.mark.parametrize("default_value", [0.0, cp.nan]) +def test_get_values_from_intervals_edge_case_1(default_value) -> None: """Query start is somewhere in a "gap".""" track_starts = cp.asarray([1, 10, 12, 16], dtype=cp.int32) track_ends = cp.asarray([3, 12, 16, 20], dtype=cp.int32) @@ -47,19 +55,28 @@ def test_get_values_from_intervals_edge_case_1() -> None: query_ends = cp.asarray([18], dtype=cp.int32) reserved = cp.zeros((1, 12), dtype=cp.dtype(" None: +@pytest.mark.parametrize("default_value", [0.0, cp.nan]) +def test_get_values_from_intervals_edge_case_2(default_value) -> None: """Query start is exactly at start index after "gap".""" track_starts = cp.asarray([1, 10, 12, 16], dtype=cp.int32) track_ends = cp.asarray([3, 12, 16, 20], dtype=cp.int32) @@ -68,15 +85,23 @@ def test_get_values_from_intervals_edge_case_2() -> None: query_ends = cp.asarray([18], dtype=cp.int32) reserved = cp.zeros((1, 8), dtype=cp.dtype(" None: +@pytest.mark.parametrize("default_value", [0.0, cp.nan]) +def test_get_values_from_intervals_edge_case_3(default_value) -> None: """Query end is somewhere in a "gap".""" track_starts = cp.asarray([5, 10, 12, 18], dtype=cp.int32) track_ends = cp.asarray([10, 12, 14, 20], dtype=cp.int32) @@ -85,15 +110,24 @@ def test_get_values_from_intervals_edge_case_3() -> None: query_ends = cp.asarray([16], dtype=cp.int32) reserved = cp.zeros((1, 7), dtype=cp.dtype(" None: +@pytest.mark.parametrize("default_value", [0.0, cp.nan]) +def test_get_values_from_intervals_edge_case_4(default_value) -> None: """Query end is exactly at end index before "gap".""" track_starts = cp.asarray([5, 10, 12, 18], dtype=cp.int32) track_ends = cp.asarray([10, 12, 14, 20], dtype=cp.int32) @@ -102,7 +136,13 @@ def test_get_values_from_intervals_edge_case_4() -> None: query_ends = cp.asarray([14], dtype=cp.int32) reserved = cp.zeros((1, 5), dtype=cp.dtype(" None: ] - query_starts[0] -def test_get_values_from_intervals_edge_case_5() -> None: +@pytest.mark.parametrize("default_value", [0.0, cp.nan]) +def test_get_values_from_intervals_edge_case_5(default_value) -> None: """Query end is exactly at end index before "gap".""" track_starts = cp.asarray([5, 10, 12, 18], dtype=cp.uint32) track_ends = cp.asarray([10, 12, 14, 20], dtype=cp.uint32) @@ -123,19 +164,28 @@ def test_get_values_from_intervals_edge_case_5() -> None: query_ends = cp.asarray([20], dtype=cp.uint32) reserved = cp.zeros((1, 11), dtype=cp.dtype(" None: +@pytest.mark.parametrize("default_value", [0.0, cp.nan]) +def test_get_values_from_intervals_batch_of_2(default_value) -> None: """Query end is exactly at end index before "gap".""" track_starts = cp.asarray([5, 10, 12, 18], dtype=cp.int32) track_ends = cp.asarray([10, 12, 14, 20], dtype=cp.int32) @@ -144,20 +194,30 @@ def test_get_values_from_intervals_batch_of_2() -> None: query_ends = cp.asarray([18, 20], dtype=cp.int32) reserved = cp.zeros([2, 11], dtype=cp.dtype(" None: +@pytest.mark.parametrize("default_value", [0.0, cp.nan]) +def test_get_values_from_intervals_batch_multiple_tracks(default_value) -> None: """Query end is exactly at end index before "gap".""" track_starts = cp.asarray( [5, 10, 12, 18, 8, 9, 10, 18, 25, 10, 100, 1000], dtype=cp.int32 @@ -178,27 +238,29 @@ def test_get_values_from_intervals_batch_multiple_tracks() -> None: track_values, query_starts, query_ends, - reserved, + default_value=default_value, + out=reserved, sizes=cp.asarray([4, 5, 3], dtype=cp.int32), ) + x = default_value expected = cp.asarray( [ [ - [20.0, 20.0, 20.0, 30.0, 30.0, 40.0, 40.0, 0.0, 0.0, 0.0, 0.0], - [20.0, 30.0, 30.0, 40.0, 40.0, 0.0, 0.0, 0.0, 0.0, 50.0, 50.0], - [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], - [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], + [20.0, 20.0, 20.0, 30.0, 30.0, 40.0, 40.0, x, x, x, x], + [20.0, 30.0, 30.0, 40.0, 40.0, x, x, x, x, 50.0, 50.0], + [x, x, x, x, x, x, x, x, x, x, x], + [x, x, x, x, x, x, x, x, x, x, x], ], [ - [0.0, 60.0, 70.0, 80.0, 80.0, 80.0, 80.0, 0.0, 0.0, 0.0, 0.0], - [70.0, 80.0, 80.0, 80.0, 80.0, 0.0, 0.0, 0.0, 0.0, 90.0, 90.0], - [90.0, 90.0, 0.0, 0.0, 0.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0], - [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], + [x, 60.0, 70.0, 80.0, 80.0, 80.0, 80.0, x, x, x, x], + [70.0, 80.0, 80.0, 80.0, 80.0, x, x, x, x, 90.0, 90.0], + [90.0, 90.0, x, x, x, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0], + [x, x, x, x, x, x, x, x, x, x, x], ], [ - [0.0, 0.0, 0.0, 110.0, 110.0, 110.0, 110.0, 110.0, 110.0, 110.0, 110.0], + [x, x, x, 110.0, 110.0, 110.0, 110.0, 110.0, 110.0, 110.0, 110.0], [ - 0.0, + x, 110.0, 110.0, 110.0, @@ -210,9 +272,9 @@ def test_get_values_from_intervals_batch_multiple_tracks() -> None: 110.0, 110.0, ], - [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], + [x, x, x, x, x, x, x, x, x, x, x], [ - 0.0, + x, 120.0, 120.0, 120.0, @@ -229,35 +291,4 @@ def test_get_values_from_intervals_batch_multiple_tracks() -> None: ) print(expected) print(values) - assert (values == expected).all() - - -def test_default_nan() -> None: - """Query end is exactly at end index before "gap" - Now instead of zeros, NaN values should be - used. - .""" - track_starts = cp.asarray([5, 10, 12, 18], dtype=cp.int32) - track_ends = cp.asarray([10, 12, 14, 20], dtype=cp.int32) - track_values = cp.asarray([20.0, 30.0, 40.0, 50.0], dtype=cp.dtype("f4")) - query_starts = cp.asarray([7, 9], dtype=cp.int32) - query_ends = cp.asarray([18, 20], dtype=cp.int32) - reserved = cp.zeros([2, 11], dtype=cp.dtype(" None: +@pytest.mark.parametrize("default_value", [0.0, cp.nan]) +def test_get_values_from_intervals_window(default_value) -> None: """.""" track_starts = cp.asarray([1, 3, 10, 12, 16], dtype=cp.int32) track_ends = cp.asarray([3, 10, 12, 16, 20], dtype=cp.int32) @@ -20,8 +23,9 @@ def test_get_values_from_intervals_window() -> None: track_values, query_starts, query_ends, - reserved, window_size=5, + default_value=default_value, + out=reserved, ) expected = cp.asarray([[16.0, 21.0, 42.0]]) @@ -33,7 +37,8 @@ def test_get_values_from_intervals_window() -> None: assert (values == expected).all() -def test_get_values_from_intervals_edge_case_1() -> None: +@pytest.mark.parametrize("default_value", [0.0, cp.nan]) +def test_get_values_from_intervals_edge_case_1(default_value) -> None: """Query start is somewhere in a "gap".""" track_starts = cp.asarray([1, 10, 12, 16], dtype=cp.int32) track_ends = cp.asarray([3, 12, 16, 20], dtype=cp.int32) @@ -47,21 +52,27 @@ def test_get_values_from_intervals_edge_case_1() -> None: track_values, query_starts, query_ends, - reserved, + default_value=default_value, + out=reserved, window_size=3, ) - expected = cp.asarray([[0.0, 20.0, 40.0, 46.666668]]) + x = default_value + if isnan(default_value): + expected = cp.asarray([[x, 30.0, 40.0, 46.666668]]) + else: + expected = cp.asarray([[x, 20.0, 40.0, 46.666668]]) print(expected) print(values) assert ( - cp.allclose(values, expected) + cp.allclose(values, expected, equal_nan=True) and expected.shape[-1] == (query_ends[0] - query_starts[0]) / 3 ) -def test_get_values_from_intervals_edge_case_2() -> None: +@pytest.mark.parametrize("default_value", [0.0, cp.nan]) +def test_get_values_from_intervals_edge_case_2(default_value) -> None: """Query start is exactly at start index after "gap".""" track_starts = cp.asarray([1, 10, 12, 16], dtype=cp.int32) track_ends = cp.asarray([3, 12, 16, 20], dtype=cp.int32) @@ -75,17 +86,21 @@ def test_get_values_from_intervals_edge_case_2() -> None: track_values, query_starts, query_ends, - reserved, window_size=4, + default_value=default_value, + out=reserved, ) expected = cp.asarray([[35.0, 45.0]]) + print(expected) + print(values) assert ( - cp.allclose(values, expected) + cp.allclose(values, expected, equal_nan=True) and expected.shape[-1] == (query_ends[0] - query_starts[0]) / 4 ) -def test_get_values_from_intervals_edge_case_3() -> None: +@pytest.mark.parametrize("default_value", [0.0, cp.nan]) +def test_get_values_from_intervals_edge_case_3(default_value) -> None: """Query end is somewhere in a "gap".""" track_starts = cp.asarray([5, 10, 12, 18], dtype=cp.int32) track_ends = cp.asarray([10, 12, 14, 20], dtype=cp.int32) @@ -99,18 +114,25 @@ def test_get_values_from_intervals_edge_case_3() -> None: track_values, query_starts, query_ends, - reserved, window_size=4, + default_value=default_value, + out=reserved, ) # expected = cp.asarray([[20, 20, 30, 30, 40, 40, 0, 0]]) - expected = cp.asarray([[25.0, 20.0]]) - - assert (values == expected).all() and expected.shape[-1] == ( - query_ends[0] - query_starts[0] - ) / 4 + if isnan(default_value): + expected = cp.asarray([[25.0, 40.0]]) + else: + expected = cp.asarray([[25.0, 20.0]]) + print(expected) + print(values) + assert ( + cp.allclose(expected, values, equal_nan=True) + and expected.shape[-1] == (query_ends[0] - query_starts[0]) / 4 + ) -def test_get_values_from_intervals_edge_case_4() -> None: +@pytest.mark.parametrize("default_value", [0.0, cp.nan]) +def test_get_values_from_intervals_edge_case_4(default_value) -> None: """Query end is exactly at end index before "gap".""" track_starts = cp.asarray([5, 10, 12, 18], dtype=cp.int32) track_ends = cp.asarray([10, 12, 14, 20], dtype=cp.int32) @@ -124,8 +146,9 @@ def test_get_values_from_intervals_edge_case_4() -> None: track_values, query_starts, query_ends, - reserved, window_size=3, + default_value=default_value, + out=reserved, ) # without window function:[[20, 20, 30, 30, 40, 40]] expected = cp.asarray([[23.333334, 36.666668]]) @@ -134,12 +157,13 @@ def test_get_values_from_intervals_edge_case_4() -> None: print(values) assert ( - cp.allclose(values, expected) + cp.allclose(values, expected, equal_nan=True) and expected.shape[-1] == (query_ends[0] - query_starts[0]) / 3 ) -def test_get_values_from_intervals_edge_case_5() -> None: +@pytest.mark.parametrize("default_value", [0.0, cp.nan]) +def test_get_values_from_intervals_edge_case_5(default_value) -> None: """Query end is exactly at end index before "gap".""" track_starts = cp.asarray([5, 10, 12, 18], dtype=cp.uint32) track_ends = cp.asarray([10, 12, 14, 20], dtype=cp.uint32) @@ -153,21 +177,27 @@ def test_get_values_from_intervals_edge_case_5() -> None: track_values, query_starts, query_ends, - reserved, window_size=3, + default_value=default_value, + out=reserved, ) - expected = cp.asarray([[23.333334, 36.666668, 0.0, 33.333332]]) + x = default_value + if isnan(default_value): + expected = cp.asarray([[23.333334, 36.666668, x, 50.0]]) + else: + expected = cp.asarray([[23.333334, 36.666668, x, 33.333332]]) print(expected) print(values) assert ( - cp.allclose(values, expected) + cp.allclose(values, expected, equal_nan=True) and expected.shape[-1] == (query_ends[0] - query_starts[0]) / 3 ) -def test_get_values_from_intervals_batch_of_2() -> None: +@pytest.mark.parametrize("default_value", [0.0, cp.nan]) +def test_get_values_from_intervals_batch_of_2(default_value) -> None: """Query end is exactly at end index before "gap".""" track_starts = cp.asarray([5, 10, 12, 18], dtype=cp.int32) track_ends = cp.asarray([10, 12, 14, 20], dtype=cp.int32) @@ -181,22 +211,29 @@ def test_get_values_from_intervals_batch_of_2() -> None: track_values, query_starts, query_ends, - reserved, window_size=3, + default_value=default_value, + out=reserved, ) # expected without window function: # [20.0, 20.0, 20.0, 20.0, 30.0, 30.0, 40.0, 40.0, 0.0, 0.0, 0.0, 0.0] # [20.0, 20.0, 30.0, 30.0, 40.0, 40.0, 0.0, 0.0, 0.0, 0.0, 50.0, 50.0] - expected = cp.asarray( - [[20.0, 26.666666, 26.666666, 0.0], [23.333334, 36.666668, 0.0, 33.333332]] - ) + if isnan(default_value): + expected = cp.asarray( + [[20.0, 26.666666, 40.0, cp.nan], [23.333334, 36.666668, cp.nan, 50.0]] + ) + else: + expected = cp.asarray( + [[20.0, 26.666666, 26.666666, 0.0], [23.333334, 36.666668, 0.0, 33.333332]] + ) print(expected) print(values) - assert cp.allclose(values, expected) + assert cp.allclose(values, expected, equal_nan=True) -def test_one_track_one_sample() -> None: +@pytest.mark.parametrize("default_value", [0.0, cp.nan]) +def test_one_track_one_sample(default_value) -> None: """ This tests a specific combination of track and batch index of the larger test case below: @@ -220,15 +257,17 @@ def test_one_track_one_sample() -> None: track_values, query_starts, query_ends, - reserved, sizes=cp.asarray([4], dtype=cp.int32), window_size=3, + default_value=default_value, + out=reserved, ) + x = default_value expected = cp.asarray( [ [ - [20.0, 30.0, 30.0, 40.0, 40.0, 0.0, 0.0, 0.0, 0.0, 50.0, 50.0], + [20.0, 30.0, 30.0, 40.0, 40.0, x, x, x, x, 50.0, 50.0], ], ] ) @@ -236,9 +275,9 @@ def test_one_track_one_sample() -> None: def apply_window(full_matrix): return cp.stack( [ - cp.mean(full_matrix[:, :, :3], axis=2), - cp.mean(full_matrix[:, :, 3:6], axis=2), - cp.mean(full_matrix[:, :, 6:9], axis=2), + cp.nanmean(full_matrix[:, :, :3], axis=2), + cp.nanmean(full_matrix[:, :, 3:6], axis=2), + cp.nanmean(full_matrix[:, :, 6:9], axis=2), ], axis=-1, ) @@ -249,10 +288,11 @@ def apply_window(full_matrix): print(expected) print("actual:") print(values) - assert cp.allclose(values, expected) + assert cp.allclose(values, expected, equal_nan=True) -def test_one_track_one_sample_2() -> None: +@pytest.mark.parametrize("default_value", [0.0, cp.nan]) +def test_one_track_one_sample_2(default_value) -> None: """ This tests a specific combination of track and batch index of the larger test case below: @@ -276,16 +316,17 @@ def test_one_track_one_sample_2() -> None: track_values, query_starts, query_ends, - reserved, sizes=cp.asarray([3], dtype=cp.int32), window_size=3, + default_value=default_value, + out=reserved, ) - + x = default_value expected = cp.asarray( [ [ [ - 0.0, + x, 110.0, 110.0, 110.0, @@ -304,9 +345,9 @@ def test_one_track_one_sample_2() -> None: def apply_window(full_matrix): return cp.stack( [ - cp.mean(full_matrix[:, :, :3], axis=2), - cp.mean(full_matrix[:, :, 3:6], axis=2), - cp.mean(full_matrix[:, :, 6:9], axis=2), + cp.nanmean(full_matrix[:, :, :3], axis=2), + cp.nanmean(full_matrix[:, :, 3:6], axis=2), + cp.nanmean(full_matrix[:, :, 6:9], axis=2), ], axis=-1, ) @@ -319,10 +360,11 @@ def apply_window(full_matrix): print(values) print("difference") print(values - expected) - assert cp.allclose(values, expected, atol=1e-2, rtol=1e-2) + assert cp.allclose(values, expected, atol=1e-2, rtol=1e-2, equal_nan=True) -def test_get_values_from_intervals_multiple_tracks() -> None: +@pytest.mark.parametrize("default_value", [0.0, cp.nan]) +def test_get_values_from_intervals_multiple_tracks(default_value) -> None: """Test interval_to_values with 3 tracks and a batch size of 1.""" track_starts = cp.asarray( [5, 10, 12, 18, 8, 9, 10, 18, 25, 10, 100, 1000], dtype=cp.int32 @@ -343,22 +385,24 @@ def test_get_values_from_intervals_multiple_tracks() -> None: track_values, query_starts, query_ends, - reserved, sizes=cp.asarray([4, 5, 3], dtype=cp.int32), window_size=3, + default_value=default_value, + out=reserved, ) + x = default_value expected = cp.asarray( [ [ - [20.0, 30.0, 30.0, 40.0, 40.0, 0.0, 0.0, 0.0, 0.0, 50.0, 50.0], + [20.0, 30.0, 30.0, 40.0, 40.0, x, x, x, x, 50.0, 50.0], ], [ - [70.0, 80.0, 80.0, 80.0, 80.0, 0.0, 0.0, 0.0, 0.0, 90.0, 90.0], + [70.0, 80.0, 80.0, 80.0, 80.0, x, x, x, x, 90.0, 90.0], ], [ [ - 0.0, + x, 110.0, 110.0, 110.0, @@ -377,9 +421,9 @@ def test_get_values_from_intervals_multiple_tracks() -> None: def apply_window(full_matrix): return cp.stack( [ - cp.mean(full_matrix[:, :, :3], axis=2), - cp.mean(full_matrix[:, :, 3:6], axis=2), - cp.mean(full_matrix[:, :, 6:9], axis=2), + cp.nanmean(full_matrix[:, :, :3], axis=2), + cp.nanmean(full_matrix[:, :, 3:6], axis=2), + cp.nanmean(full_matrix[:, :, 6:9], axis=2), ], axis=-1, ) @@ -392,13 +436,16 @@ def apply_window(full_matrix): print(values) print("difference") print(values - expected) - assert cp.allclose(values, expected, atol=1e-2, rtol=1e-2) + assert cp.allclose(values, expected, atol=1e-2, rtol=1e-2, equal_nan=True) @pytest.mark.parametrize( "sequence_length, window_size", product([8, 9, 10, 11, 13, 23], [2, 3, 4, 10, 11]) ) -def test_combinations_sequence_length_window_size(sequence_length, window_size) -> None: +@pytest.mark.parametrize("default_value", [0.0, cp.nan]) +def test_combinations_sequence_length_window_size( + sequence_length, window_size, default_value +) -> None: """Test intervals_to_values with 3 tracks and a batch size of 4.""" track_starts = cp.asarray( [5, 10, 12, 18, 8, 9, 10, 18, 25, 10, 100, 1000], dtype=cp.int32 @@ -421,9 +468,10 @@ def test_combinations_sequence_length_window_size(sequence_length, window_size) track_values, query_starts, query_ends, - reserved, sizes=cp.asarray([4, 5, 3], dtype=cp.int32), window_size=window_size, + default_value=default_value, + out=reserved, ) reserved = cp.zeros([3, 4, sequence_length], dtype=cp.dtype(" None: """.""" track_starts = cp.asarray([1, 3, 10, 12, 16] * n_tracks, dtype=cp.int32) - track_ends = cp.asarray([3, 10, 12, 16, 20] * n_tracks, dtype=cp.int32) + track_ends = cp.asarray([3, 8, 12, 16, 20] * n_tracks, dtype=cp.int32) track_values = cp.asarray( [20.0, 15.0, 30.0, 40.0, 50.0] * n_tracks, dtype=cp.dtype("f4") ) @@ -475,6 +525,92 @@ def test_combinations_window_size_batch_size_n_tracks( query_ends, sizes=sizes, window_size=window_size, + default_value=default_value, + ) + + values_with_window_size_1 = intervals_to_values( + track_starts, + track_ends, + track_values, + query_starts, + query_ends, + sizes=sizes, + window_size=1, + default_value=default_value, + ) + + reduced_dim = sequence_length // window_size + full_matrix = values_with_window_size_1[:, :, : reduced_dim * window_size] + full_matrix = full_matrix.reshape( + full_matrix.shape[0], full_matrix.shape[1], reduced_dim, window_size + ) + expected = cp.nanmean(full_matrix, axis=-1) + + print("expected:") + print(expected) + print("actual:") + print(values) + + assert cp.allclose(values, expected, equal_nan=True) + + +def create_random_track_data(n_tracks, min_intervals=10, max_intervals=20): + """Create random track data for testing.""" + track_starts = [] + track_ends = [] + values = [] + sizes = [] + for _ in range(n_tracks): + current_start = 0 + generate_n_intervals = np.random.randint(min_intervals, max_intervals) + for i in range(generate_n_intervals): + start = current_start + np.random.randint( + 1, 50 + ) # Ensure a gap between intervals + end = start + np.random.randint(1, 100) # Random interval length + track_starts.append(start) + track_ends.append(end) + values.append(np.random.random()) + current_start = end # Update the start for the next interval + sizes.append(generate_n_intervals) + + return ( + cp.asarray(track_starts, dtype=cp.int32), + cp.asarray(track_ends, dtype=cp.int32), + cp.asarray(values, dtype=cp.int32), + cp.asarray(sizes, dtype=cp.int32), + ) + + +def create_random_queries(batch_size, sequence_length=200): + start = np.random.randint(1, 50, size=batch_size) + end = start + sequence_length + return cp.asarray(start, dtype=cp.int32), cp.asarray(end, dtype=cp.int32) + + +@pytest.mark.parametrize( + "window_size, batch_size, n_tracks", + product([1, 2, 3, 7], [1, 2, 3, 7], [1, 2, 3, 7]), +) +@pytest.mark.parametrize("default_value", [0.0, cp.nan]) +def test_combinations_window_size_batch_size_n_tracks_on_random_data( + window_size, batch_size, n_tracks, default_value +) -> None: + sequence_length = 200 + track_starts, track_ends, track_values, sizes = create_random_track_data(n_tracks) + query_starts, query_ends = create_random_queries( + batch_size, sequence_length=sequence_length + ) + + values = intervals_to_values( + track_starts, + track_ends, + track_values, + query_starts, + query_ends, + sizes=sizes, + window_size=window_size, + default_value=default_value, ) values_with_window_size_1 = intervals_to_values( @@ -485,6 +621,7 @@ def test_combinations_window_size_batch_size_n_tracks( query_ends, sizes=sizes, window_size=1, + default_value=default_value, ) reduced_dim = sequence_length // window_size @@ -492,11 +629,15 @@ def test_combinations_window_size_batch_size_n_tracks( full_matrix = full_matrix.reshape( full_matrix.shape[0], full_matrix.shape[1], reduced_dim, window_size ) - expected = cp.mean(full_matrix, axis=-1) + expected = cp.nanmean(full_matrix, axis=-1) print("expected:") print(expected) print("actual:") print(values) - assert cp.allclose(values, expected) + assert cp.allclose(values, expected, equal_nan=True) + + +if __name__ == "__main__": + print(create_random_track_data(3))