Merge pull request #170 from Princeton-LSI-ResearchComputing/dupe_accucor_samples

hepcat72 · web-flow · commit 0f85bf41d019 · 2021-08-27T12:41:55.000-04:00
Sample name uniqueness check added to accucor data loader
diff --git a/DataRepo/example_data/obob_maven_6eaas_inf_sample_dupe.xlsx b/DataRepo/example_data/obob_maven_6eaas_inf_sample_dupe.xlsx
diff --git a/DataRepo/management/commands/load_accucor_msruns.py b/DataRepo/management/commands/load_accucor_msruns.py
@@ -1,6 +1,7 @@
 import os.path
 
 import pandas as pd
+from django.core.exceptions import ValidationError
 from django.core.management import BaseCommand
 
 from DataRepo.utils import AccuCorDataLoader
@@ -56,10 +57,49 @@ def add_arguments(self, parser):
 
     def handle(self, *args, **options):
         print("Reading accucor file: " + options["accucor_file"])
+
+        # Note, setting `mangle_dupe_cols=False` results in `Setting mangle_dupe_cols=False is not supported yet`, so
+        # the following is to catch duplicate headers
+        orig_heads = pd.read_excel(
+            options["accucor_file"],
+            nrows=1,
+            header=None,
+            sheet_name=0,
+            engine="openpyxl",
+            squeeze=True,
+        ).iloc[0]
+        num_uniq_orig_heads = len(pd.unique(orig_heads))
+        num_orig_heads = len(orig_heads)
+        if num_uniq_orig_heads != num_orig_heads:
+            raise ValidationError(
+                f"Column headers in Original data sheet are not unique. There are {num_orig_heads} columns and "
+                f"{num_uniq_orig_heads} unique values"
+            )
+
+        corr_heads = pd.read_excel(
+            options["accucor_file"],
+            nrows=1,
+            header=None,
+            sheet_name=1,
+            engine="openpyxl",
+            squeeze=True,
+        ).iloc[0]
+        num_uniq_corr_heads = len(pd.unique(corr_heads))
+        num_corr_heads = len(corr_heads)
+        if num_uniq_corr_heads != num_corr_heads:
+            raise ValidationError(
+                "Column headers in Corrected data sheet are not unique. There are "
+                + str(num_corr_heads)
+                + " columns and "
+                + str(num_uniq_corr_heads)
+                + " unique values"
+            )
+
         # get the first 2 sheets as the original and corrected data
         original = pd.read_excel(
             options["accucor_file"], sheet_name=0, engine="openpyxl"
         ).dropna(axis=0, how="all")
+
         corrected = pd.read_excel(
             options["accucor_file"], sheet_name=1, engine="openpyxl"
         ).dropna(axis=0, how="all")
diff --git a/DataRepo/tests/test_models.py b/DataRepo/tests/test_models.py
@@ -618,6 +618,30 @@ def test_peak_data_fraction(self):
         )
         self.assertAlmostEqual(peak_data.fraction, 0.9952169753)
 
+    def test_dupe_sample_load_fails(self):  # expects load_accucor_msruns to raise ValidationError for this file
+        # Insert the "tst-dupe1" sample first: samples are required to pre-exist for the accucor loader.
+        sample = Sample(
+            name="tst-dupe1",
+            researcher="Michael",
+            time_collected=timedelta(minutes=5),
+            animal=Animal.objects.all()[0],
+            tissue=Tissue.objects.all()[0],
+        )
+        sample.full_clean()  # run model validation before the record is saved
+        sample.save()
+
+        with self.assertRaises(ValidationError):  # the test passes only if the command raises
+            call_command(
+                "load_accucor_msruns",
+                protocol="Default",
+                accucor_file="DataRepo/example_data/obob_maven_6eaas_inf_sample_dupe.xlsx",
+                date="2021-08-20",
+                researcher="Michael",
+            )
+
+    def test_dupe_samples_not_loaded(self):  # asserts that no Sample named "tst-dupe1" exists in the database
+        self.assertEqual(Sample.objects.filter(name__exact="tst-dupe1").count(), 0)
+
 
 class AnimalAndSampleLoadingTests(TestCase):
     @classmethod
diff --git a/DataRepo/utils.py b/DataRepo/utils.py
@@ -6,6 +6,7 @@
 import dateutil.parser  # type: ignore
 import pandas as pd
 from django.db import transaction
+from pandas.errors import EmptyDataError
 
 from DataRepo.models import (
     Animal,
@@ -348,10 +349,35 @@ def validate_dataframes(self):
             ]
             if sample not in self.skip_samples
         ]
-        err_msg = "Samples are not equivalent in the original and corrected data"
-        assert collections.Counter(original_samples) == collections.Counter(
-            corrected_samples
-        ), err_msg
+
+        # Make sure all sample columns have names
+        orig_iter = collections.Counter(original_samples)
+        orig_iter_err = ""
+        for k, v in orig_iter.items():
+            if k.startswith("Unnamed: "):
+                raise EmptyDataError(
+                    "Sample columns missing headers found in the Original data sheet. You have "
+                    + str(len(self.accucor_original_df.columns))
+                    + " columns. Be sure to delete any unused columns."
+                )
+            orig_iter_err += '"' + str(k) + '":' + str(v) + '",'
+        corr_iter = collections.Counter(corrected_samples)
+        corr_iter_err = ""
+        for k, v in corr_iter.items():
+            if k.startswith("Unnamed: "):
+                raise Exception(
+                    "Sample columns missing headers found in the Corrected data sheet. You have "
+                    + str(len(self.accucor_corrected_df.columns))
+                    + " columns."
+                )
+            corr_iter_err += '"' + str(k) + '":"' + str(v) + '",'
+
+        # Make sure that the sheets have the same number of sample columns
+        err_msg = (
+            f"Number of samples in the original and corrected sheets differ. Original: [{orig_iter_err}] Corrected: "
+            "[{corr_iter_err}]."
+        )
+        assert orig_iter == corr_iter, err_msg
         self.original_samples = original_samples
 
     def corrected_file_tracer_labeled_column_regex(self):
@@ -402,7 +428,9 @@ def retrieve_samples(self):
             except Sample.DoesNotExist:
                 missing_samples += 1
                 print(f"Could not find sample {original_sample_name} in the database.")
-        assert missing_samples == 0, f"{missing_samples} samples are missing."
+        assert (
+            missing_samples == 0
+        ), f"{missing_samples} samples are missing. See noted sample names above."
 
     def get_first_sample_column_index(self, df):