Skip to content

Commit 5bc7113

Browse files
committed
julia groupby NA aware
1 parent 8155181 commit 5bc7113

File tree

1 file changed

+18
-16
lines changed

1 file changed

+18
-16
lines changed

juliadf/groupby-juliadf.jl

Lines changed: 18 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -36,13 +36,13 @@ print("grouping...\n"); flush(stdout);
3636

3737
question = "sum v1 by id1"; # q1
3838
GC.gc();
39-
t = @elapsed (ANS = combine(groupby(x, :id1), skipmissing(:v1) => sum => :v1); println(size(ANS)); flush(stdout));
39+
t = @elapsed (ANS = combine(groupby(x, :id1), :v1 => sumskipmissing => :v1); println(size(ANS)); flush(stdout));
4040
m = memory_usage();
4141
chkt = @elapsed chk = sum(ANS.v1);
4242
write_log(1, task, data_name, in_rows, question, size(ANS, 1), size(ANS, 2), solution, ver, git, fun, t, m, cache, make_chk(chk), chkt, on_disk);
4343
ANS = 0;
4444
GC.gc();
45-
t = @elapsed (ANS = combine(groupby(x, :id1), skipmissing(:v1) => sum => :v1); println(size(ANS)); flush(stdout));
45+
t = @elapsed (ANS = combine(groupby(x, :id1), :v1 => sumskipmissing => :v1); println(size(ANS)); flush(stdout));
4646
m = memory_usage();
4747
chkt = @elapsed chk = sum(ANS.v1);
4848
write_log(2, task, data_name, in_rows, question, size(ANS, 1), size(ANS, 2), solution, ver, git, fun, t, m, cache, make_chk(chk), chkt, on_disk);
@@ -52,13 +52,13 @@ ANS = 0;
5252

5353
question = "sum v1 by id1:id2"; # q2
5454
GC.gc();
55-
t = @elapsed (ANS = combine(groupby(x, [:id1, :id2]), :v1 => sum => :v1); println(size(ANS)); flush(stdout));
55+
t = @elapsed (ANS = combine(groupby(x, [:id1, :id2]), :v1 => sumskipmissing => :v1); println(size(ANS)); flush(stdout));
5656
m = memory_usage();
5757
chkt = @elapsed chk = sum(ANS.v1);
5858
write_log(1, task, data_name, in_rows, question, size(ANS, 1), size(ANS, 2), solution, ver, git, fun, t, m, cache, make_chk(chk), chkt, on_disk);
5959
ANS = 0;
6060
GC.gc();
61-
t = @elapsed (ANS = combine(groupby(x, [:id1, :id2]), :v1 => sum => :v1); println(size(ANS)); flush(stdout));
61+
t = @elapsed (ANS = combine(groupby(x, [:id1, :id2]), :v1 => sumskipmissing => :v1); println(size(ANS)); flush(stdout));
6262
m = memory_usage();
6363
chkt = @elapsed chk = sum(ANS.v1);
6464
write_log(2, task, data_name, in_rows, question, size(ANS, 1), size(ANS, 2), solution, ver, git, fun, t, m, cache, make_chk(chk), chkt, on_disk);
@@ -68,13 +68,13 @@ ANS = 0;
6868

6969
question = "sum v1 mean v3 by id3"; # q3
7070
GC.gc();
71-
t = @elapsed (ANS = combine(groupby(x, :id3), :v1 => sum => :v1, :v3 => mean => :v3); println(size(ANS)); flush(stdout));
71+
t = @elapsed (ANS = combine(groupby(x, :id3), :v1 => sumskipmissing => :v1, :v3 => meanskipmissing => :v3); println(size(ANS)); flush(stdout));
7272
m = memory_usage();
7373
chkt = @elapsed chk = [sum(ANS.v1), sum(ANS.v3)];
7474
write_log(1, task, data_name, in_rows, question, size(ANS, 1), size(ANS, 2), solution, ver, git, fun, t, m, cache, make_chk(chk), chkt, on_disk);
7575
ANS = 0;
7676
GC.gc();
77-
t = @elapsed (ANS = combine(groupby(x, :id3), :v1 => sum => :v1, :v3 => mean => :v3); println(size(ANS)); flush(stdout));
77+
t = @elapsed (ANS = combine(groupby(x, :id3), :v1 => sumskipmissing => :v1, :v3 => meanskipmissing => :v3); println(size(ANS)); flush(stdout));
7878
m = memory_usage();
7979
chkt = @elapsed chk = [sum(ANS.v1), sum(ANS.v3)];
8080
write_log(2, task, data_name, in_rows, question, size(ANS, 1), size(ANS, 2), solution, ver, git, fun, t, m, cache, make_chk(chk), chkt, on_disk);
@@ -84,14 +84,14 @@ ANS = 0;
8484

8585
question = "mean v1:v3 by id4"; # q4
8686
GC.gc();
87-
t = @elapsed (ANS = combine(groupby(x, :id4), :v1 => mean => :v1, :v2 => mean => :v2, :v3 => mean => :v3); println(size(ANS)); flush(stdout));
87+
t = @elapsed (ANS = combine(groupby(x, :id4), :v1 => meanskipmissing => :v1, :v2 => meanskipmissing => :v2, :v3 => meanskipmissing => :v3); println(size(ANS)); flush(stdout));
8888
m = memory_usage();
8989
t_start = time_ns();
9090
chkt = @elapsed chk = [sum(ANS.v1), sum(ANS.v2), sum(ANS.v3)];
9191
write_log(1, task, data_name, in_rows, question, size(ANS, 1), size(ANS, 2), solution, ver, git, fun, t, m, cache, make_chk(chk), chkt, on_disk);
9292
ANS = 0;
9393
GC.gc();
94-
t = @elapsed (ANS = combine(groupby(x, :id4), :v1 => mean => :v1, :v2 => mean => :v2, :v3 => mean => :v3); println(size(ANS)); flush(stdout));
94+
t = @elapsed (ANS = combine(groupby(x, :id4), :v1 => meanskipmissing => :v1, :v2 => meanskipmissing => :v2, :v3 => meanskipmissing => :v3); println(size(ANS)); flush(stdout));
9595
m = memory_usage();
9696
chkt = @elapsed chk = [sum(ANS.v1), sum(ANS.v2), sum(ANS.v3)];
9797
write_log(2, task, data_name, in_rows, question, size(ANS, 1), size(ANS, 2), solution, ver, git, fun, t, m, cache, make_chk(chk), chkt, on_disk);
@@ -101,13 +101,13 @@ ANS = 0;
101101

102102
question = "sum v1:v3 by id6"; # q5
103103
GC.gc();
104-
t = @elapsed (ANS = combine(groupby(x, :id6), :v1 => sum => :v1, :v2 => sum => :v2, :v3 => sum => :v3); println(size(ANS)); flush(stdout));
104+
t = @elapsed (ANS = combine(groupby(x, :id6), :v1 => sumskipmissing => :v1, :v2 => sumskipmissing => :v2, :v3 => sumskipmissing => :v3); println(size(ANS)); flush(stdout));
105105
m = memory_usage();
106106
chkt = @elapsed chk = [sum(ANS.v1), sum(ANS.v2), sum(ANS.v3)];
107107
write_log(1, task, data_name, in_rows, question, size(ANS, 1), size(ANS, 2), solution, ver, git, fun, t, m, cache, make_chk(chk), chkt, on_disk);
108108
ANS = 0;
109109
GC.gc();
110-
t = @elapsed (ANS = combine(groupby(x, :id6), :v1 => sum => :v1, :v2 => sum => :v2, :v3 => sum => :v3); println(size(ANS)); flush(stdout));
110+
t = @elapsed (ANS = combine(groupby(x, :id6), :v1 => sumskipmissing => :v1, :v2 => sumskipmissing => :v2, :v3 => sumskipmissing => :v3); println(size(ANS)); flush(stdout));
111111
m = memory_usage();
112112
chkt = @elapsed chk = [sum(ANS.v1), sum(ANS.v2), sum(ANS.v3)];
113113
write_log(2, task, data_name, in_rows, question, size(ANS, 1), size(ANS, 2), solution, ver, git, fun, t, m, cache, make_chk(chk), chkt, on_disk);
@@ -117,13 +117,13 @@ ANS = 0;
117117

118118
question = "median v3 sd v3 by id4 id5"; # q6
119119
GC.gc();
120-
t = @elapsed (ANS = combine(groupby(x, [:id4, :id5]), :v3 => median => :median_v3, :v3 => std => :sd_v3); println(size(ANS)); flush(stdout));
120+
t = @elapsed (ANS = combine(groupby(x, [:id4, :id5]), :v3 => medianskipmissing => :median_v3, :v3 => stdskipmissing => :sd_v3); println(size(ANS)); flush(stdout));
121121
m = memory_usage();
122122
chkt = @elapsed chk = [sum(ANS.median_v3), sum(ANS.sd_v3)];
123123
write_log(1, task, data_name, in_rows, question, size(ANS, 1), size(ANS, 2), solution, ver, git, fun, t, m, cache, make_chk(chk), chkt, on_disk);
124124
ANS = 0;
125125
GC.gc();
126-
t = @elapsed (ANS = combine(groupby(x, [:id4, :id5]), :v3 => median => :median_v3, :v3 => std => :sd_v3); println(size(ANS)); flush(stdout));
126+
t = @elapsed (ANS = combine(groupby(x, [:id4, :id5]), :v3 => medianskipmissing => :median_v3, :v3 => stdskipmissing => :sd_v3); println(size(ANS)); flush(stdout));
127127
m = memory_usage();
128128
chkt = @elapsed chk = [sum(ANS.median_v3), sum(ANS.sd_v3)];;
129129
write_log(2, task, data_name, in_rows, question, size(ANS, 1), size(ANS, 2), solution, ver, git, fun, t, m, cache, make_chk(chk), chkt, on_disk);
@@ -133,13 +133,13 @@ ANS = 0;
133133

134134
question = "max v1 - min v2 by id3"; # q7
135135
GC.gc();
136-
t = @elapsed (ANS = combine(groupby(x, :id3), [:v1, :v2] => ((v1, v2) -> maximum(v1)-minimum(v2)) => :range_v1_v2); println(size(ANS)); flush(stdout));
136+
t = @elapsed (ANS = combine(groupby(x, :id3), [:v1, :v2] => ((v1, v2) -> maximum(skipmissing(v1))-minimum(skipmissing(v2))) => :range_v1_v2); println(size(ANS)); flush(stdout));
137137
m = memory_usage();
138138
chkt = @elapsed chk = sum(ANS.range_v1_v2);
139139
write_log(1, task, data_name, in_rows, question, size(ANS, 1), size(ANS, 2), solution, ver, git, fun, t, m, cache, make_chk(chk), chkt, on_disk);
140140
ANS = 0;
141141
GC.gc();
142-
t = @elapsed (ANS = combine(groupby(x, :id3), [:v1, :v2] => ((v1, v2) -> maximum(v1)-minimum(v2)) => :range_v1_v2); println(size(ANS)); flush(stdout));
142+
t = @elapsed (ANS = combine(groupby(x, :id3), [:v1, :v2] => ((v1, v2) -> maximum(skipmissing(v1))-minimum(skipmissing(v2))) => :range_v1_v2); println(size(ANS)); flush(stdout));
143143
m = memory_usage();
144144
chkt = @elapsed chk = sum(ANS.range_v1_v2);
145145
write_log(2, task, data_name, in_rows, question, size(ANS, 1), size(ANS, 2), solution, ver, git, fun, t, m, cache, make_chk(chk), chkt, on_disk);
@@ -149,6 +149,7 @@ ANS = 0;
149149

150150
question = "largest two v3 by id6"; # q8
151151
GC.gc();
152+
## TODO: make q8 NA-aware (the partialsort call below does not yet skip missing values in v3)
152153
t = @elapsed (ANS = combine(groupby(x, :id6), :v3 => (x -> partialsort(x, 1:min(2, length(x)), rev=true)) => :largest2_v3); println(size(ANS)); flush(stdout));
153154
m = memory_usage();
154155
chkt = @elapsed chk = sum(ANS.largest2_v3);
@@ -165,6 +166,7 @@ ANS = 0;
165166

166167
question = "regression v1 v2 by id2 id4"; # q9
167168
GC.gc();
169+
## TODO: make q9 NA-aware (the cor(v1, v2) call below does not yet skip missing values)
168170
t = @elapsed (ANS = combine(groupby(x, [:id2, :id4]), [:v1, :v2] => ((v1,v2) -> cor(v1, v2)^2) => :r2); println(size(ANS)); flush(stdout));
169171
m = memory_usage();
170172
chkt = @elapsed chk = sum(ANS.r2);
@@ -181,13 +183,13 @@ ANS = 0;
181183

182184
question = "sum v3 count by id1:id6"; # q10
183185
GC.gc();
184-
t = @elapsed (ANS = combine(groupby(x, [:id1, :id2, :id3, :id4, :id5, :id6]), :v3 => sum => :v3, :v3 => length => :count); println(size(ANS)); flush(stdout));
186+
t = @elapsed (ANS = combine(groupby(x, [:id1, :id2, :id3, :id4, :id5, :id6]), :v3 => sumskipmissing => :v3, :v3 => length => :count); println(size(ANS)); flush(stdout));
185187
m = memory_usage();
186188
chkt = @elapsed chk = [sum(ANS.v3), sum(ANS.count)];
187189
write_log(1, task, data_name, in_rows, question, size(ANS, 1), size(ANS, 2), solution, ver, git, fun, t, m, cache, make_chk(chk), chkt, on_disk);
188190
ANS = 0;
189191
GC.gc();
190-
t = @elapsed (ANS = combine(groupby(x, [:id1, :id2, :id3, :id4, :id5, :id6]), :v3 => sum => :v3, :v3 => length => :count); println(size(ANS)); flush(stdout));
192+
t = @elapsed (ANS = combine(groupby(x, [:id1, :id2, :id3, :id4, :id5, :id6]), :v3 => sumskipmissing => :v3, :v3 => length => :count); println(size(ANS)); flush(stdout));
191193
m = memory_usage();
192194
chkt = @elapsed chk = [sum(ANS.v3), sum(ANS.count)];
193195
write_log(2, task, data_name, in_rows, question, size(ANS, 1), size(ANS, 2), solution, ver, git, fun, t, m, cache, make_chk(chk), chkt, on_disk);

0 commit comments

Comments
 (0)