Commit cfe87ad
Fix HMM data caching issue
The Hidden Markov Model data generator created the required data on its first invocation, cached the resulting files, and loaded from them on subsequent calls. However, the cache file names did not include any of the parameters used for data generation. This was a problem because the HMM unit test generates far fewer samples with shorter signals, while the main HMM experiment uses more samples and longer signals. A user who ran the unit tests before the main code would therefore unintentionally train on the smaller dataset. This caused the incorrect values reported for the HMM experiment in the original version of the published paper, which were later corrected. The HMM module now creates a separate cache file for each configuration, so this error can no longer occur. Currently the file name includes only the numeric settings, which is enough to prevent the error; in the future it could be extended to cover non-numeric settings as well.
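To make the failure mode concrete, below is a minimal, self-contained sketch of the two caching schemes. It is illustrative only: `cache_path_old`, `cache_path_new`, and the stand-in generator are hypothetical simplifications of the real HMM data module, which stores several files per split.

import os
import pickle as pkl

import numpy as np


def cache_path_old(data_dir: str, split: str) -> str:
    # Old scheme: the cache key names only the split, so a 10-sample
    # test fixture and a 200-sample experiment collide on one file.
    return os.path.join(data_dir, f"{split}_features.npz")


def cache_path_new(data_dir: str, split: str, count: int,
                   signal_length: int, seed: int) -> str:
    # New scheme: the numeric generation settings are baked into the
    # name, so different configurations can never shadow each other.
    return os.path.join(
        data_dir, f"{split}_{count}_{signal_length}_{seed}_features.npz"
    )


def load_or_generate(path: str, count: int, signal_length: int, seed: int):
    # Under the old scheme this happily returns stale data generated
    # with different settings; under the new scheme a settings change
    # yields a fresh path and triggers regeneration.
    if os.path.exists(path):
        with open(path, "rb") as fp:
            return pkl.load(fp)
    rng = np.random.default_rng(seed)
    data = rng.normal(size=(count, signal_length, 3))  # stand-in generator
    with open(path, "wb") as fp:
        pkl.dump(data, fp)
    return data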

File tree

3 files changed: +38 -43 lines changed

experiments/hmm/main.py

Lines changed: 1 addition & 0 deletions
@@ -68,6 +68,7 @@ def main(
 
     # Load data
     hmm = HMM(n_folds=5, fold=fold, seed=seed)
+    hmm.prepare_data()
 
     print(f"Training classifier..")
 
tests/datasets/test_hmm.py

Lines changed: 3 additions & 1 deletion
@@ -108,8 +108,10 @@ def test_hmm(
         fold=fold,
         num_workers=num_workers,
         seed=seed,
+        test_size=10,
+        signal_length=20
     )
-    hmm.download(split="test", test_size=10, signal_length=20)
+    hmm.download(split="test")
     x_test = hmm.preprocess(split="test")["x"]
     y_test = hmm.preprocess(split="test")["y"]
     assert tuple(x_test.shape) == (10, 20, 3)

tint/datasets/hmm.py

Lines changed: 34 additions & 42 deletions
@@ -41,6 +41,9 @@ class HMM(DataModule):
             Ignored if n_folds is None. Default to ``None``
         num_workers (int): Number of workers for the loaders. Default to 0
         seed (int): For the random split. Default to 42
+        train_size (int): Number of training samples to generate
+        test_size (int): Number of test samples to generate
+        signal_length (int): Length of the signal to generate
 
     References:
         `Explaining Time Series Predictions with Dynamic Masks <https://arxiv.org/abs/2106.05303>`_
@@ -73,6 +76,9 @@ def __init__(
         fold: int = None,
         num_workers: int = 0,
         seed: int = 42,
+        train_size: int = 800,
+        test_size: int = 200,
+        signal_length: int = 200,
     ):
         super().__init__(
             data_dir=data_dir,
@@ -92,6 +98,10 @@ def __init__(
         self.scale = scale or [[0.1, 1.6, 0.5], [-0.1, -0.4, -1.5]]
         self.p0 = p0 or [0.5]
 
+        self.train_size = train_size
+        self.test_size = test_size
+        self.signal_length = signal_length
+
     def init_dist(self):
         # Covariance matrix is constant across states but distribution
         # means change based on the state value
@@ -126,19 +136,23 @@ def next_state(previous_state, t):
             next_state = np.random.binomial(1, params)
             return next_state
 
+    def get_base_file_path(self, split):
+        return os.path.join(
+            self.data_dir,
+            (f"{split}_{self.train_size}_{self.test_size}_{self.signal_length}_{self.n_signal}_" +
+             f"{self.train}_{self.seed}_")
+        )
+
     def download(
         self,
-        train_size: int = 800,
-        test_size: int = 200,
-        signal_length: int = 200,
         split: str = "train",
     ):
-        file = os.path.join(self.data_dir, f"{split}_")
+        base_file_path = self.get_base_file_path(split)
 
         if split == "train":
-            count = train_size
+            count = self.train_size
         elif split == "test":
-            count = test_size
+            count = self.test_size
         else:
            raise NotImplementedError
 
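With this change, the unit-test settings and the experiment defaults resolve to different cache prefixes. A hypothetical illustration (assuming the remaining constructor arguments keep their defaults; the values of `n_signal` and `train` are elided as placeholders):

hmm_test = HMM(seed=42, test_size=10, signal_length=20)
hmm_test.get_base_file_path("test")
# -> ".../test_800_10_20_<n_signal>_<train>_42_"

hmm_exp = HMM(seed=42)  # train_size=800, test_size=200, signal_length=200
hmm_exp.get_base_file_path("test")
# -> ".../test_800_200_200_<n_signal>_<train>_42_"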
@@ -159,7 +173,7 @@ def download
             previous = np.random.binomial(1, self.p0)[0]
             delta_state = 0
             state_n = None
-            for i in range(signal_length):
+            for i in range(self.signal_length):
                 next = self.next_state(previous, delta_state)
                 state_n = next
 
@@ -197,37 +211,23 @@ def download
             all_states.append(states)
             label_logits.append(y_logits)
 
-        with open(
-            os.path.join(self.data_dir, file + "features.npz"), "wb"
-        ) as fp:
+        with open(base_file_path + "features.npz", "wb") as fp:
             pkl.dump(obj=features, file=fp)
-        with open(
-            os.path.join(self.data_dir, file + "labels.npz"), "wb"
-        ) as fp:
+        with open(base_file_path + "labels.npz", "wb") as fp:
             pkl.dump(obj=labels, file=fp)
-        with open(
-            os.path.join(self.data_dir, file + "importance.npz"), "wb"
-        ) as fp:
+        with open(base_file_path + "importance.npz", "wb") as fp:
             pkl.dump(obj=importance_score, file=fp)
-        with open(
-            os.path.join(self.data_dir, file + "states.npz"), "wb"
-        ) as fp:
+        with open(base_file_path + "states.npz", "wb") as fp:
             pkl.dump(obj=all_states, file=fp)
-        with open(
-            os.path.join(self.data_dir, file + "labels_logits.npz"), "wb"
-        ) as fp:
+        with open(base_file_path + "labels_logits.npz", "wb") as fp:
             pkl.dump(obj=label_logits, file=fp)
 
     def preprocess(self, split: str = "train") -> dict:
-        file = os.path.join(self.data_dir, f"{split}_")
+        base_file_path = self.get_base_file_path(split)
 
-        with open(
-            os.path.join(self.data_dir, file + "features.npz"), "rb"
-        ) as fp:
+        with open(base_file_path + "features.npz", "rb") as fp:
             features = np.stack(pkl.load(file=fp))
-        with open(
-            os.path.join(self.data_dir, file + "labels.npz"), "rb"
-        ) as fp:
+        with open(base_file_path + "labels.npz", "rb") as fp:
             labels = np.stack(pkl.load(file=fp))
 
         return {
@@ -237,28 +237,20 @@ def preprocess(self, split: str = "train") -> dict:
 
     def prepare_data(self):
         """"""
-        if not os.path.exists(
-            os.path.join(self.data_dir, "train_features.npz")
-        ):
+        if not os.path.exists(self.get_base_file_path("train") + "features.npz"):
            self.download(split="train")
-        if not os.path.exists(
-            os.path.join(self.data_dir, "test_features.npz")
-        ):
+        if not os.path.exists(self.get_base_file_path("test") + "features.npz"):
            self.download(split="test")
 
     def true_saliency(self, split: str = "train") -> th.Tensor:
-        file = os.path.join(self.data_dir, f"{split}_")
+        base_file_path = self.get_base_file_path(split)
 
-        with open(
-            os.path.join(self.data_dir, file + "features.npz"), "rb"
-        ) as fp:
+        with open(base_file_path + "features.npz", "rb") as fp:
             features = np.stack(pkl.load(file=fp))
 
         # Load the true states that define the truly salient features
         # and define A as in Section 3.2:
-        with open(
-            os.path.join(self.data_dir, file + "states.npz"), "rb"
-        ) as fp:
+        with open(base_file_path + "states.npz", "rb") as fp:
             true_states = np.stack(pkl.load(file=fp))
         true_states += 1
 
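As a consequence, callers such as experiments/hmm/main.py only need to construct the HMM with the desired settings and call prepare_data(); a configuration whose cache file is missing is regenerated automatically instead of silently reusing data from a different configuration. A hypothetical usage sketch (argument names taken from the diffs above; fold=0 and the output shape are illustrative, inferred from the defaults and the test assertion):

hmm = HMM(n_folds=5, fold=0, seed=42)  # defaults: 800/200 samples, length 200
hmm.prepare_data()                     # generates or loads the matching cache
x_test = hmm.preprocess(split="test")["x"]  # presumably shape (200, 200, 3)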