Skip to content

Commit 0f85bf4

Browse files
authored
Merge pull request #170 from Princeton-LSI-ResearchComputing/dupe_accucor_samples
Sample name uniqueness check added to accucor data loader
2 parents bc9ca21 + 1b9e822 commit 0f85bf4

File tree

4 files changed

+97
-5
lines changed

4 files changed

+97
-5
lines changed
Binary file not shown.

DataRepo/management/commands/load_accucor_msruns.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import os.path
22

33
import pandas as pd
4+
from django.core.exceptions import ValidationError
45
from django.core.management import BaseCommand
56

67
from DataRepo.utils import AccuCorDataLoader
@@ -56,10 +57,49 @@ def add_arguments(self, parser):
5657

5758
def handle(self, *args, **options):
5859
print("Reading accucor file: " + options["accucor_file"])
60+
61+
# Note, setting `mangle_dupe_cols=False` results in `Setting mangle_dupe_cols=False is not supported yet`, so
62+
# the following is to catch duplicate headers
63+
orig_heads = pd.read_excel(
64+
options["accucor_file"],
65+
nrows=1,
66+
header=None,
67+
sheet_name=0,
68+
engine="openpyxl",
69+
squeeze=True,
70+
).iloc[0]
71+
num_uniq_orig_heads = len(pd.unique(orig_heads))
72+
num_orig_heads = len(orig_heads)
73+
if num_uniq_orig_heads != num_orig_heads:
74+
raise ValidationError(
75+
f"Column headers in Original data sheet are not unique. There are {num_orig_heads} columns and "
76+
f"{num_uniq_orig_heads} unique values"
77+
)
78+
79+
corr_heads = pd.read_excel(
80+
options["accucor_file"],
81+
nrows=1,
82+
header=None,
83+
sheet_name=1,
84+
engine="openpyxl",
85+
squeeze=True,
86+
).iloc[0]
87+
num_uniq_corr_heads = len(pd.unique(corr_heads))
88+
num_corr_heads = len(corr_heads)
89+
if num_uniq_corr_heads != num_corr_heads:
90+
raise ValidationError(
91+
"Column headers in Corrected data sheet are not unique. There are "
92+
+ str(num_corr_heads)
93+
+ " columns and "
94+
+ str(num_uniq_corr_heads)
95+
+ " unique values"
96+
)
97+
5998
# get the first 2 sheets as the original and corrected data
6099
original = pd.read_excel(
61100
options["accucor_file"], sheet_name=0, engine="openpyxl"
62101
).dropna(axis=0, how="all")
102+
63103
corrected = pd.read_excel(
64104
options["accucor_file"], sheet_name=1, engine="openpyxl"
65105
).dropna(axis=0, how="all")

DataRepo/tests/test_models.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -618,6 +618,30 @@ def test_peak_data_fraction(self):
618618
)
619619
self.assertAlmostEqual(peak_data.fraction, 0.9952169753)
620620

621+
def test_dupe_sample_load_fails(self):
    # The accucor loader only works with samples that already exist, so the
    # sample named "tst-dupe1" is created (validated, then saved) up front.
    sample_fields = {
        "name": "tst-dupe1",
        "researcher": "Michael",
        "time_collected": timedelta(minutes=5),
        "animal": Animal.objects.all()[0],
        "tissue": Tissue.objects.all()[0],
    }
    dupe_sample = Sample(**sample_fields)
    dupe_sample.full_clean()
    dupe_sample.save()

    # Running the loader on the sample-dupe example file is expected to be
    # rejected with a ValidationError.
    loader_options = {
        "protocol": "Default",
        "accucor_file": "DataRepo/example_data/obob_maven_6eaas_inf_sample_dupe.xlsx",
        "date": "2021-08-20",
        "researcher": "Michael",
    }
    with self.assertRaises(ValidationError):
        call_command("load_accucor_msruns", **loader_options)
641+
642+
def test_dupe_samples_not_loaded(self):
    # No sample named "tst-dupe1" should be present in the database here.
    dupe_count = Sample.objects.filter(name__exact="tst-dupe1").count()
    self.assertEqual(dupe_count, 0)
644+
621645

622646
class AnimalAndSampleLoadingTests(TestCase):
623647
@classmethod

DataRepo/utils.py

Lines changed: 33 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import dateutil.parser # type: ignore
77
import pandas as pd
88
from django.db import transaction
9+
from pandas.errors import EmptyDataError
910

1011
from DataRepo.models import (
1112
Animal,
@@ -348,10 +349,35 @@ def validate_dataframes(self):
348349
]
349350
if sample not in self.skip_samples
350351
]
351-
err_msg = "Samples are not equivalent in the original and corrected data"
352-
assert collections.Counter(original_samples) == collections.Counter(
353-
corrected_samples
354-
), err_msg
352+
353+
# Make sure all sample columns have names
354+
orig_iter = collections.Counter(original_samples)
355+
orig_iter_err = ""
356+
for k, v in orig_iter.items():
357+
if k.startswith("Unnamed: "):
358+
raise EmptyDataError(
359+
"Sample columns missing headers found in the Original data sheet. You have "
360+
+ str(len(self.accucor_original_df.columns))
361+
+ " columns. Be sure to delete any unused columns."
362+
)
363+
orig_iter_err += '"' + str(k) + '":' + str(v) + '",'
364+
corr_iter = collections.Counter(corrected_samples)
365+
corr_iter_err = ""
366+
for k, v in corr_iter.items():
367+
if k.startswith("Unnamed: "):
368+
raise Exception(
369+
"Sample columns missing headers found in the Corrected data sheet. You have "
370+
+ str(len(self.accucor_corrected_df.columns))
371+
+ " columns."
372+
)
373+
corr_iter_err += '"' + str(k) + '":"' + str(v) + '",'
374+
375+
# Make sure that the sheets have the same number of sample columns
376+
err_msg = (
377+
f"Number of samples in the original and corrected sheets differ. Original: [{orig_iter_err}] Corrected: "
378+
"[{corr_iter_err}]."
379+
)
380+
assert orig_iter == corr_iter, err_msg
355381
self.original_samples = original_samples
356382

357383
def corrected_file_tracer_labeled_column_regex(self):
@@ -402,7 +428,9 @@ def retrieve_samples(self):
402428
except Sample.DoesNotExist:
403429
missing_samples += 1
404430
print(f"Could not find sample {original_sample_name} in the database.")
405-
assert missing_samples == 0, f"{missing_samples} samples are missing."
431+
assert (
432+
missing_samples == 0
433+
), f"{missing_samples} samples are missing. See noted sample names above."
406434

407435
def get_first_sample_column_index(self, df):
408436

0 commit comments

Comments
 (0)