From 9f5aed30bd2652a0db49d77f057fd573aeb9e32c Mon Sep 17 00:00:00 2001
From: Brock Mendel <jbrockmendel@gmail.com>
Date: Sat, 18 Aug 2018 18:23:34 -0700
Subject: [PATCH 1/4] use fused types for group_cummin and group_cummax

---
 pandas/_libs/groupby_helper.pxi.in | 101 +++++++++++++++++++----------
 1 file changed, 66 insertions(+), 35 deletions(-)
diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in
index 0062a6c8d31ab..497614c8bafa1 100644
--- a/pandas/_libs/groupby_helper.pxi.in
+++ b/pandas/_libs/groupby_helper.pxi.in
@@ -719,25 +719,38 @@ def group_min_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
                 else:
                     out[i, j] = minx[i, j]
 
+{{endfor}}
+
+
+ctypedef fused group_t:
+    float64_t
+    float32_t
+    int64_t
+
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def group_cummin_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
-                          ndarray[{{dest_type2}}, ndim=2] values,
-                          ndarray[int64_t] labels,
-                          bint is_datetimelike):
+def group_cummin(ndarray[group_t, ndim=2] out,
+                 ndarray[group_t, ndim=2] values,
+                 ndarray[int64_t] labels,
+                 bint is_datetimelike):
     """
     Only transforms on axis=0
     """
     cdef:
         Py_ssize_t i, j, N, K, size
-        {{dest_type2}} val, mval
-        ndarray[{{dest_type2}}, ndim=2] accum
+        group_t val, mval
+        ndarray[group_t, ndim=2] accum
         int64_t lab
 
     N, K = (<object> values).shape
     accum = np.empty_like(values)
-    accum.fill({{inf_val}})
+
+    if group_t is int64_t:
+        # evaluated at compile-time
+        accum.fill(_int64_max)
+    else:
+        accum.fill(np.inf)
 
     with nogil:
         for i in range(N):
@@ -749,37 +762,50 @@ def group_cummin_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
                 val = values[i, j]
 
                 # val = nan
-                {{if name == 'int64'}}
-                if is_datetimelike and val == {{nan_val}}:
-                    out[i, j] = {{nan_val}}
+                if group_t is int64_t:
+                    # evaluated at compile-time
+                    if is_datetimelike and val == iNaT:
+                        out[i, j] = iNaT
+                        continue
+
                 else:
-                {{else}}
-                if val == val:
-                {{endif}}
-                    mval = accum[lab, j]
-                    if val < mval:
-                        accum[lab, j] = mval = val
-                    out[i, j] = mval
+                    if val != val:
+                        continue
+
+                mval = accum[lab, j]
+                if val < mval:
+                    accum[lab, j] = mval = val
+                out[i, j] = mval
+
+
+group_cummin_float64 = group_cummin["float64_t"]
+group_cummin_float32 = group_cummin["float32_t"]
+group_cummin_int64 = group_cummin["int64_t"]
 
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def group_cummax_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
-                          ndarray[{{dest_type2}}, ndim=2] values,
-                          ndarray[int64_t] labels,
-                          bint is_datetimelike):
+def group_cummax(ndarray[group_t, ndim=2] out,
+                 ndarray[group_t, ndim=2] values,
+                 ndarray[int64_t] labels,
+                 bint is_datetimelike):
     """
     Only transforms on axis=0
     """
     cdef:
         Py_ssize_t i, j, N, K, size
-        {{dest_type2}} val, mval
-        ndarray[{{dest_type2}}, ndim=2] accum
+        group_t val, mval
+        ndarray[group_t, ndim=2] accum
         int64_t lab
 
     N, K = (<object> values).shape
     accum = np.empty_like(values)
-    accum.fill(-{{inf_val}})
+
+    if group_t is int64_t:
+        # evaluated at compile-time
+        accum.fill(-_int64_max)
+    else:
+        accum.fill(-np.inf)
 
     with nogil:
         for i in range(N):
@@ -790,16 +816,21 @@ def group_cummax_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
             for j in range(K):
                 val = values[i, j]
 
-                {{if name == 'int64'}}
-                if is_datetimelike and val == {{nan_val}}:
-                    out[i, j] = {{nan_val}}
+                if group_t is int64_t:
+                    # evaluated at compile-time
+                    if is_datetimelike and val == iNaT:
+                        out[i, j] = iNaT
+                        continue
                 else:
-                {{else}}
-                if val == val:
-                {{endif}}
-                    mval = accum[lab, j]
-                    if val > mval:
-                        accum[lab, j] = mval = val
-                    out[i, j] = mval
+                    if val != val:
+                        continue
 
-{{endfor}}
+                mval = accum[lab, j]
+                if val > mval:
+                    accum[lab, j] = mval = val
+                out[i, j] = mval
+
+
+group_cummax_float64 = group_cummax["float64_t"]
+group_cummax_float32 = group_cummax["float32_t"]
+group_cummax_int64 = group_cummax["int64_t"]

From 97278bccdd335e1161bae8cbbcd96eff2ea48008 Mon Sep 17 00:00:00 2001
From: Brock Mendel <jbrockmendel@gmail.com>
Date: Sun, 19 Aug 2018 10:12:22 -0700
Subject: [PATCH 2/4] use fused type for unstack

---
 pandas/_libs/algos_common_helper.pxi.in | 66 ++++++++++++-------
 pandas/_libs/reshape_helper.pxi.in      | 85 +++++++++++++++----------
 2 files changed, 93 insertions(+), 58 deletions(-)

diff --git a/pandas/_libs/algos_common_helper.pxi.in b/pandas/_libs/algos_common_helper.pxi.in
index 97b7196da80bb..c9f909baad474 100644
--- a/pandas/_libs/algos_common_helper.pxi.in
+++ b/pandas/_libs/algos_common_helper.pxi.in
@@ -19,6 +19,49 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
 # 1-d template
 #----------------------------------------------------------------------
 
+ctypedef fused algos_t:
+    float64_t
+    float32_t
+    object
+    int32_t
+    int64_t
+    uint64_t
+    uint8_t
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+cpdef map_indices(ndarray[algos_t] index):
+    """
+    Produce a dict mapping the values of the input array to their respective
+    locations.
+
+    Example:
+        array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1}
+
+    Better to do this with Cython because of the enormous speed boost.
+    """
+    cdef:
+        Py_ssize_t i, length
+        dict result = {}
+
+    length = len(index)
+
+    for i in range(length):
+        result[index[i]] = i
+
+    return result
+
+
+map_indices_float64 = map_indices["float64_t"]
+map_indices_float32 = map_indices["float32_t"]
+map_indices_object = map_indices["object"]
+map_indices_int32 = map_indices["int32_t"]
+map_indices_int64 = map_indices["int64_t"]
+map_indices_uint64 = map_indices["uint64_t"]
+map_indices_uint8 = map_indices["uint8_t"]
+
+
 {{py:
 
 # name, c_type, dtype, can_hold_na, nogil
@@ -43,29 +86,6 @@ def get_dispatch(dtypes):
       in get_dispatch(dtypes)}}
 
 
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cpdef map_indices_{{name}}(ndarray[{{c_type}}] index):
-    """
-    Produce a dict mapping the values of the input array to their respective
-    locations.
-
-    Example:
-        array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1}
-
-    Better to do this with Cython because of the enormous speed boost.
-    """
-    cdef Py_ssize_t i, length
-    cdef dict result = {}
-
-    length = len(index)
-
-    for i in range(length):
-        result[index[i]] = i
-
-    return result
-
-
 @cython.boundscheck(False)
 @cython.wraparound(False)
 def pad_{{name}}(ndarray[{{c_type}}] old, ndarray[{{c_type}}] new,
diff --git a/pandas/_libs/reshape_helper.pxi.in b/pandas/_libs/reshape_helper.pxi.in
index bb9a5977f8b45..0eab84c71ee71 100644
--- a/pandas/_libs/reshape_helper.pxi.in
+++ b/pandas/_libs/reshape_helper.pxi.in
@@ -8,34 +8,28 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
 # reshape
 # ----------------------------------------------------------------------
 
-{{py:
-
-# name, c_type
-dtypes = [('uint8', 'uint8_t'),
-          ('uint16', 'uint16_t'),
-          ('uint32', 'uint32_t'),
-          ('uint64', 'uint64_t'),
-          ('int8', 'int8_t'),
-          ('int16', 'int16_t'),
-          ('int32', 'int32_t'),
-          ('int64', 'int64_t'),
-          ('float32', 'float32_t'),
-          ('float64', 'float64_t'),
-          ('object', 'object')]
-}}
-
-{{for dtype, c_type in dtypes}}
-
+ctypedef fused reshape_t:
+    uint8_t
+    uint16_t
+    uint32_t
+    uint64_t
+    int8_t
+    int16_t
+    int32_t
+    int64_t
+    float32_t
+    float64_t
+    object
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
-def unstack_{{dtype}}(ndarray[{{c_type}}, ndim=2] values,
-                      ndarray[uint8_t, ndim=1] mask,
-                      Py_ssize_t stride,
-                      Py_ssize_t length,
-                      Py_ssize_t width,
-                      ndarray[{{c_type}}, ndim=2] new_values,
-                      ndarray[uint8_t, ndim=2] new_mask):
+def unstack(ndarray[reshape_t, ndim=2] values,
+            ndarray[uint8_t, ndim=1] mask,
+            Py_ssize_t stride,
+            Py_ssize_t length,
+            Py_ssize_t width,
+            ndarray[reshape_t, ndim=2] new_values,
+            ndarray[uint8_t, ndim=2] new_mask):
     """
     transform long sorted_values to wide new_values
 
@@ -50,23 +44,33 @@ def unstack_{{dtype}}(ndarray[{{c_type}}, ndim=2] values,
         result array
     new_mask : boolean ndarray
         result mask
-
     """
-
     cdef:
         Py_ssize_t i, j, w, nulls, s, offset
 
-    {{if dtype == 'object'}}
-    if True:
-    {{else}}
-    with nogil:
-    {{endif}}
+    if reshape_t is not object:
+        with nogil:
+            for i in range(stride):
+                nulls = 0
 
-        for i in range(stride):
+                for j in range(length):
+                    for w in range(width):
 
+                        offset = j * width + w
+
+                        if mask[offset]:
+                            s = i * width + w
+                            new_values[j, s] = values[offset - nulls, i]
+                            new_mask[j, s] = 1
+                        else:
+                            nulls += 1
+
+    else:
+        # identical to above version, but "with nogil" is not available
+        for i in range(stride):
             nulls = 0
-            for j in range(length):
 
+            for j in range(length):
                 for w in range(width):
 
                     offset = j * width + w
@@ -78,4 +82,15 @@ def unstack_{{dtype}}(ndarray[{{c_type}}, ndim=2] values,
                     else:
                         nulls += 1
 
-{{endfor}}
+
+unstack_uint8 = unstack["uint8_t"]
+unstack_uint16 = unstack["uint16_t"]
+unstack_uint32 = unstack["uint32_t"]
+unstack_uint64 = unstack["uint64_t"]
+unstack_int8 = unstack["int8_t"]
+unstack_int16 = unstack["int16_t"]
+unstack_int32 = unstack["int32_t"]
+unstack_int64 = unstack["int64_t"]
+unstack_float32 = unstack["float32_t"]
+unstack_float64 = unstack["float64_t"]
+unstack_object = unstack["object"]

From 7912d7181a0c8f3e421b17e67d279c38ae4998a9 Mon Sep 17 00:00:00 2001
From: Brock Mendel <jbrockmendel@gmail.com>
Date: Sun, 19 Aug 2018 11:30:43 -0700
Subject: [PATCH 3/4] use fused types instead of templates for pad/backfill and _take_2d

---
 pandas/_libs/algos_common_helper.pxi.in | 188 ++++++++++++++++--------
 pandas/_libs/algos_take_helper.pxi.in   |  31 ++--
 2 files changed, 141 insertions(+), 78 deletions(-)

diff --git a/pandas/_libs/algos_common_helper.pxi.in b/pandas/_libs/algos_common_helper.pxi.in
index c9f909baad474..b519f20e32570 100644
--- a/pandas/_libs/algos_common_helper.pxi.in
+++ b/pandas/_libs/algos_common_helper.pxi.in
@@ -59,41 +59,17 @@ map_indices_object = map_indices["object"]
 map_indices_int32 = map_indices["int32_t"]
 map_indices_int64 = map_indices["int64_t"]
 map_indices_uint64 = map_indices["uint64_t"]
-map_indices_uint8 = map_indices["uint8_t"]
-
-
-{{py:
-
-# name, c_type, dtype, can_hold_na, nogil
-dtypes = [('float64', 'float64_t', 'np.float64', True, True),
-          ('float32', 'float32_t', 'np.float32', True, True),
-          ('object', 'object', 'object', True, False),
-          ('int32', 'int32_t', 'np.int32', False, True),
-          ('int64', 'int64_t', 'np.int64', False, True),
-          ('uint64', 'uint64_t', 'np.uint64', False, True),
-          ('bool', 'uint8_t', 'np.bool', False, True)]
-
-def get_dispatch(dtypes):
-
-    for name, c_type, dtype, can_hold_na, nogil in dtypes:
-
-        nogil_str = 'with nogil:' if nogil else ''
-        tab = '    ' if nogil else ''
-        yield name, c_type, dtype, can_hold_na, nogil_str, tab
-}}
-
-{{for name, c_type, dtype, can_hold_na, nogil_str, tab
-      in get_dispatch(dtypes)}}
+map_indices_bool = map_indices["uint8_t"]
 
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def pad_{{name}}(ndarray[{{c_type}}] old, ndarray[{{c_type}}] new,
-                 limit=None):
-    cdef Py_ssize_t i, j, nleft, nright
-    cdef ndarray[int64_t, ndim=1] indexer
-    cdef {{c_type}} cur, next
-    cdef int lim, fill_count = 0
+def pad(ndarray[algos_t] old, ndarray[algos_t] new, limit=None):
+    cdef:
+        Py_ssize_t i, j, nleft, nright
+        ndarray[int64_t, ndim=1] indexer
+        algos_t cur, next
+        int lim, fill_count = 0
 
     nleft = len(old)
     nright = len(new)
@@ -149,19 +125,28 @@ def pad_{{name}}(ndarray[{{c_type}}] old, ndarray[{{c_type}}] new,
 
     return indexer
 
+pad_float64 = pad["float64_t"]
+pad_float32 = pad["float32_t"]
+pad_object = pad["object"]
+pad_int32 = pad["int32_t"]
+pad_int64 = pad["int64_t"]
+pad_uint64 = pad["uint64_t"]
+pad_bool = pad["uint8_t"]
+
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def pad_inplace_{{name}}(ndarray[{{c_type}}] values,
-                         ndarray[uint8_t, cast=True] mask,
-                         limit=None):
-    cdef Py_ssize_t i, N
-    cdef {{c_type}} val
-    cdef int lim, fill_count = 0
+def pad_inplace(ndarray[algos_t] values,
+                ndarray[uint8_t, cast=True] mask,
+                limit=None):
+    cdef:
+        Py_ssize_t i, N
+        algos_t val
+        int lim, fill_count = 0
 
     N = len(values)
 
-    # GH 2778
+    # GH#2778
     if N == 0:
         return
 
@@ -186,18 +171,28 @@ def pad_inplace_{{name}}(ndarray[{{c_type}}] values,
             val = values[i]
 
 
+pad_inplace_float64 = pad_inplace["float64_t"]
+pad_inplace_float32 = pad_inplace["float32_t"]
+pad_inplace_object = pad_inplace["object"]
+pad_inplace_int32 = pad_inplace["int32_t"]
+pad_inplace_int64 = pad_inplace["int64_t"]
+pad_inplace_uint64 = pad_inplace["uint64_t"]
+pad_inplace_bool = pad_inplace["uint8_t"]
+
+
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def pad_2d_inplace_{{name}}(ndarray[{{c_type}}, ndim=2] values,
-                            ndarray[uint8_t, ndim=2] mask,
-                            limit=None):
-    cdef Py_ssize_t i, j, N, K
-    cdef {{c_type}} val
-    cdef int lim, fill_count = 0
+def pad_2d_inplace(ndarray[algos_t, ndim=2] values,
+                   ndarray[uint8_t, ndim=2] mask,
+                   limit=None):
+    cdef:
+        Py_ssize_t i, j, N, K
+        algos_t val
+        int lim, fill_count = 0
 
     K, N = (<object> values).shape
 
-    # GH 2778
+    # GH#2778
     if N == 0:
         return
 
@@ -223,6 +218,16 @@ def pad_2d_inplace_{{name}}(ndarray[{{c_type}}, ndim=2] values,
                 fill_count = 0
                 val = values[j, i]
 
+
+pad_2d_inplace_float64 = pad_2d_inplace["float64_t"]
+pad_2d_inplace_float32 = pad_2d_inplace["float32_t"]
+pad_2d_inplace_object = pad_2d_inplace["object"]
+pad_2d_inplace_int32 = pad_2d_inplace["int32_t"]
+pad_2d_inplace_int64 = pad_2d_inplace["int64_t"]
+pad_2d_inplace_uint64 = pad_2d_inplace["uint64_t"]
+pad_2d_inplace_bool = pad_2d_inplace["uint8_t"]
+
+
 """
 Backfilling logic for generating fill vector
 
@@ -251,12 +256,12 @@ D
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def backfill_{{name}}(ndarray[{{c_type}}] old, ndarray[{{c_type}}] new,
-                      limit=None):
-    cdef Py_ssize_t i, j, nleft, nright
-    cdef ndarray[int64_t, ndim=1] indexer
-    cdef {{c_type}} cur, prev
-    cdef int lim, fill_count = 0
+def backfill(ndarray[algos_t] old, ndarray[algos_t] new, limit=None):
+    cdef:
+        Py_ssize_t i, j, nleft, nright
+        ndarray[int64_t, ndim=1] indexer
+        algos_t cur, prev
+        int lim, fill_count = 0
 
     nleft = len(old)
     nright = len(new)
@@ -314,18 +319,28 @@ def backfill_{{name}}(ndarray[{{c_type}}] old, ndarray[{{c_type}}] new,
     return indexer
 
 
+backfill_float64 = backfill["float64_t"]
+backfill_float32 = backfill["float32_t"]
+backfill_object = backfill["object"]
+backfill_int32 = backfill["int32_t"]
+backfill_int64 = backfill["int64_t"]
+backfill_uint64 = backfill["uint64_t"]
+backfill_bool = backfill["uint8_t"]
+
+
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def backfill_inplace_{{name}}(ndarray[{{c_type}}] values,
-                              ndarray[uint8_t, cast=True] mask,
-                              limit=None):
-    cdef Py_ssize_t i, N
-    cdef {{c_type}} val
-    cdef int lim, fill_count = 0
+def backfill_inplace(ndarray[algos_t] values,
+                     ndarray[uint8_t, cast=True] mask,
+                     limit=None):
+    cdef:
+        Py_ssize_t i, N
+        algos_t val
+        int lim, fill_count = 0
 
     N = len(values)
 
-    # GH 2778
+    # GH#2778
     if N == 0:
         return
 
@@ -350,18 +365,28 @@ def backfill_inplace_{{name}}(ndarray[{{c_type}}] values,
             val = values[i]
 
 
+backfill_inplace_float64 = backfill_inplace["float64_t"]
+backfill_inplace_float32 = backfill_inplace["float32_t"]
+backfill_inplace_object = backfill_inplace["object"]
+backfill_inplace_int32 = backfill_inplace["int32_t"]
+backfill_inplace_int64 = backfill_inplace["int64_t"]
+backfill_inplace_uint64 = backfill_inplace["uint64_t"]
+backfill_inplace_bool = backfill_inplace["uint8_t"]
+
+
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def backfill_2d_inplace_{{name}}(ndarray[{{c_type}}, ndim=2] values,
-                                 ndarray[uint8_t, ndim=2] mask,
-                                 limit=None):
-    cdef Py_ssize_t i, j, N, K
-    cdef {{c_type}} val
-    cdef int lim, fill_count = 0
+def backfill_2d_inplace(ndarray[algos_t, ndim=2] values,
+                        ndarray[uint8_t, ndim=2] mask,
+                        limit=None):
+    cdef:
+        Py_ssize_t i, j, N, K
+        algos_t val
+        int lim, fill_count = 0
 
     K, N = (<object> values).shape
 
-    # GH 2778
+    # GH#2778
     if N == 0:
         return
 
@@ -388,6 +413,39 @@ def backfill_2d_inplace_{{name}}(ndarray[{{c_type}}, ndim=2] values,
                 val = values[j, i]
 
 
+backfill_2d_inplace_float64 = backfill_2d_inplace["float64_t"]
+backfill_2d_inplace_float32 = backfill_2d_inplace["float32_t"]
+backfill_2d_inplace_object = backfill_2d_inplace["object"]
+backfill_2d_inplace_int32 = backfill_2d_inplace["int32_t"]
+backfill_2d_inplace_int64 = backfill_2d_inplace["int64_t"]
+backfill_2d_inplace_uint64 = backfill_2d_inplace["uint64_t"]
+backfill_2d_inplace_bool = backfill_2d_inplace["uint8_t"]
+
+
+{{py:
+
+# name, c_type, dtype, can_hold_na, nogil
+dtypes = [('float64', 'float64_t', 'np.float64', True, True),
+          ('float32', 'float32_t', 'np.float32', True, True),
+          ('object', 'object', 'object', True, False),
+          ('int32', 'int32_t', 'np.int32', False, True),
+          ('int64', 'int64_t', 'np.int64', False, True),
+          ('uint64', 'uint64_t', 'np.uint64', False, True),
+          ('bool', 'uint8_t', 'np.bool', False, True)]
+
+def get_dispatch(dtypes):
+
+    for name, c_type, dtype, can_hold_na, nogil in dtypes:
+
+        nogil_str = 'with nogil:' if nogil else ''
+        tab = '    ' if nogil else ''
+        yield name, c_type, dtype, can_hold_na, nogil_str, tab
+}}
+
+{{for name, c_type, dtype, can_hold_na, nogil_str, tab
+      in get_dispatch(dtypes)}}
+
+
 @cython.boundscheck(False)
 @cython.wraparound(False)
 def is_monotonic_{{name}}(ndarray[{{c_type}}] arr, bint timelike):
diff --git a/pandas/_libs/algos_take_helper.pxi.in b/pandas/_libs/algos_take_helper.pxi.in
index 0e69324acd341..4883e067ea8c4 100644
--- a/pandas/_libs/algos_take_helper.pxi.in
+++ b/pandas/_libs/algos_take_helper.pxi.in
@@ -264,29 +264,34 @@ def take_2d_multi_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values,
 # take_2d internal function
 #----------------------------------------------------------------------
 
-{{py:
-
-# dtype, ctype, init_result
-dtypes = [('float64', 'float64_t', 'np.empty_like(values)'),
-          ('uint64', 'uint64_t', 'np.empty_like(values)'),
-          ('object', 'object', 'values.copy()'),
-          ('int64', 'int64_t', 'np.empty_like(values)')]
-}}
+ctypedef fused take_t:
+    float64_t
+    uint64_t
+    object
+    int64_t
 
-{{for dtype, ctype, init_result in dtypes}}
 
-cdef _take_2d_{{dtype}}(ndarray[{{ctype}}, ndim=2] values, object idx):
+cdef _take_2d(ndarray[take_t, ndim=2] values, object idx):
     cdef:
         Py_ssize_t i, j, N, K
         ndarray[Py_ssize_t, ndim=2, cast=True] indexer = idx
-        ndarray[{{ctype}}, ndim=2] result
+        ndarray[take_t, ndim=2] result
         object val
 
     N, K = (<object> values).shape
-    result = {{init_result}}
+    if take_t is object:
+        result = values.copy()
+    else:
+        result = np.empty_like(values)
+
     for i in range(N):
         for j in range(K):
             result[i, j] = values[i, indexer[i, j]]
     return result
 
-{{endfor}}
+
+# TODO: confirm whether these module-level aliases are still treated as cdef functions
+_take_2d_float64 = _take_2d[float64_t]
+_take_2d_uint64 = _take_2d[uint64_t]
+_take_2d_object = _take_2d[object]
+_take_2d_int64 = _take_2d[int64_t]

From 3e1f79a052a4d5e20b37623c41827e5601605a1d Mon Sep 17 00:00:00 2001
From: Brock Mendel <jbrockmendel@gmail.com>
Date: Sun, 19 Aug 2018 14:06:39 -0700
Subject: [PATCH 4/4] use fused types for arrmap, group_min/group_max and left_join_indexer_unique

---
 pandas/_libs/algos_common_helper.pxi.in |  41 ++++++----
 pandas/_libs/groupby_helper.pxi.in      | 101 ++++++++++++------------
 pandas/_libs/join_helper.pxi.in         |  61 +++++++++-----
 3 files changed, 117 insertions(+), 86 deletions(-)

diff --git a/pandas/_libs/algos_common_helper.pxi.in b/pandas/_libs/algos_common_helper.pxi.in
index b519f20e32570..b89cf4e4e5825 100644
--- a/pandas/_libs/algos_common_helper.pxi.in
+++ b/pandas/_libs/algos_common_helper.pxi.in
@@ -29,6 +29,31 @@ ctypedef fused algos_t:
     uint8_t
 
 
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def arrmap(ndarray[algos_t] index, object func):
+    cdef:
+        Py_ssize_t length = index.shape[0]
+        Py_ssize_t i = 0
+        ndarray[object] result = np.empty(length, dtype=np.object_)
+
+    from pandas._libs.lib import maybe_convert_objects
+
+    for i in range(length):
+        result[i] = func(index[i])
+
+    return maybe_convert_objects(result)
+
+
+arrmap_float64 = arrmap["float64_t"]
+arrmap_float32 = arrmap["float32_t"]
+arrmap_object = arrmap["object"]
+arrmap_int32 = arrmap["int32_t"]
+arrmap_int64 = arrmap["int64_t"]
+arrmap_uint64 = arrmap["uint64_t"]
+arrmap_bool = arrmap["uint8_t"]
+
+
 @cython.wraparound(False)
 @cython.boundscheck(False)
 cpdef map_indices(ndarray[algos_t] index):
@@ -502,22 +527,6 @@ def is_monotonic_{{name}}(ndarray[{{c_type}}] arr, bint timelike):
     return is_monotonic_inc, is_monotonic_dec, \
            is_unique and (is_monotonic_inc or is_monotonic_dec)
 
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def arrmap_{{name}}(ndarray[{{c_type}}] index, object func):
-    cdef Py_ssize_t length = index.shape[0]
-    cdef Py_ssize_t i = 0
-
-    cdef ndarray[object] result = np.empty(length, dtype=np.object_)
-
-    from pandas._libs.lib import maybe_convert_objects
-
-    for i in range(length):
-        result[i] = func(index[i])
-
-    return maybe_convert_objects(result)
-
 {{endfor}}
 
 #----------------------------------------------------------------------
diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in
index 497614c8bafa1..7b1dc8f41575c 100644
--- a/pandas/_libs/groupby_helper.pxi.in
+++ b/pandas/_libs/groupby_helper.pxi.in
@@ -593,37 +593,26 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out,
 # group_min, group_max
 #----------------------------------------------------------------------
 
-{{py:
-
-# name, c_type, dest_type2, nan_val
-dtypes = [('float64', 'float64_t', 'NAN', 'np.inf'),
-          ('float32', 'float32_t', 'NAN', 'np.inf'),
-          ('int64', 'int64_t', 'iNaT', '_int64_max')]
-
-def get_dispatch(dtypes):
-
-    for name, dest_type2, nan_val, inf_val in dtypes:
-        yield name, dest_type2, nan_val, inf_val
-}}
-
-
-{{for name, dest_type2, nan_val, inf_val in get_dispatch(dtypes)}}
+ctypedef fused group_t:
+    float64_t
+    float32_t
+    int64_t
 
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
-def group_max_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
-                       ndarray[int64_t] counts,
-                       ndarray[{{dest_type2}}, ndim=2] values,
-                       ndarray[int64_t] labels,
-                       Py_ssize_t min_count=-1):
+def group_max(ndarray[group_t, ndim=2] out,
+              ndarray[int64_t] counts,
+              ndarray[group_t, ndim=2] values,
+              ndarray[int64_t] labels,
+              Py_ssize_t min_count=-1):
     """
     Only aggregates on axis=0
     """
     cdef:
         Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        {{dest_type2}} val, count
-        ndarray[{{dest_type2}}, ndim=2] maxx, nobs
+        group_t val, count
+        ndarray[group_t, ndim=2] maxx, nobs
 
     assert min_count == -1, "'min_count' only used in add and prod"
 
@@ -633,7 +622,12 @@ def group_max_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
     nobs = np.zeros_like(out)
 
     maxx = np.empty_like(out)
-    maxx.fill(-{{inf_val}})
+
+    if group_t is int64_t:
+        # evaluated at compile-time
+        maxx.fill(-_int64_max)
+    else:
+        maxx.fill(-np.inf)
 
     N, K = (<object> values).shape
 
@@ -648,11 +642,9 @@ def group_max_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
                 val = values[i, j]
 
                 # not nan
-                {{if name == 'int64'}}
-                if val != {{nan_val}}:
-                {{else}}
-                if val == val and val != {{nan_val}}:
-                {{endif}}
+                if ((group_t is int64_t and val != iNaT) or
+                        (group_t is not int64_t and
+                         val == val and val != NAN)):
                     nobs[lab, j] += 1
                     if val > maxx[lab, j]:
                         maxx[lab, j] = val
@@ -660,25 +652,33 @@ def group_max_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
         for i in range(ncounts):
             for j in range(K):
                 if nobs[i, j] == 0:
-                    out[i, j] = {{nan_val}}
+                    if group_t is int64_t:
+                        out[i, j] = iNaT
+                    else:
+                        out[i, j] = NAN
                 else:
                     out[i, j] = maxx[i, j]
 
 
+group_max_float64 = group_max["float64_t"]
+group_max_float32 = group_max["float32_t"]
+group_max_int64 = group_max["int64_t"]
+
+
 @cython.wraparound(False)
 @cython.boundscheck(False)
-def group_min_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
-                       ndarray[int64_t] counts,
-                       ndarray[{{dest_type2}}, ndim=2] values,
-                       ndarray[int64_t] labels,
-                       Py_ssize_t min_count=-1):
+def group_min(ndarray[group_t, ndim=2] out,
+              ndarray[int64_t] counts,
+              ndarray[group_t, ndim=2] values,
+              ndarray[int64_t] labels,
+              Py_ssize_t min_count=-1):
     """
     Only aggregates on axis=0
     """
     cdef:
         Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        {{dest_type2}} val, count
-        ndarray[{{dest_type2}}, ndim=2] minx, nobs
+        group_t val, count
+        ndarray[group_t, ndim=2] minx, nobs
 
     assert min_count == -1, "'min_count' only used in add and prod"
 
@@ -688,7 +688,12 @@ def group_min_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
     nobs = np.zeros_like(out)
 
     minx = np.empty_like(out)
-    minx.fill({{inf_val}})
+
+    if group_t is int64_t:
+        # evaluated at compile-time
+        minx.fill(_int64_max)
+    else:
+        minx.fill(np.inf)
 
     N, K = (<object> values).shape
 
@@ -703,11 +708,9 @@ def group_min_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
                 val = values[i, j]
 
                 # not nan
-                {{if name == 'int64'}}
-                if val != {{nan_val}}:
-                {{else}}
-                if val == val and val != {{nan_val}}:
-                {{endif}}
+                if ((group_t is int64_t and val != iNaT) or
+                        (group_t is not int64_t and
+                         val == val and val != NAN)):
                     nobs[lab, j] += 1
                     if val < minx[lab, j]:
                         minx[lab, j] = val
@@ -715,17 +718,17 @@ def group_min_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
         for i in range(ncounts):
             for j in range(K):
                 if nobs[i, j] == 0:
-                    out[i, j] = {{nan_val}}
+                    if group_t is int64_t:
+                        out[i, j] = iNaT
+                    else:
+                        out[i, j] = NAN
                 else:
                     out[i, j] = minx[i, j]
 
-{{endfor}}
 
-
-ctypedef fused group_t:
-    float64_t
-    float32_t
-    int64_t
+group_min_float64 = group_min["float64_t"]
+group_min_float32 = group_min["float32_t"]
+group_min_int64 = group_min["int64_t"]
 
 
 @cython.boundscheck(False)
diff --git a/pandas/_libs/join_helper.pxi.in b/pandas/_libs/join_helper.pxi.in
index feb8cfb76a7f0..3b84edc1c60d3 100644
--- a/pandas/_libs/join_helper.pxi.in
+++ b/pandas/_libs/join_helper.pxi.in
@@ -8,24 +8,13 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
 # left_join_indexer, inner_join_indexer, outer_join_indexer
 #----------------------------------------------------------------------
 
-{{py:
-
-# name, c_type, dtype
-dtypes = [('float64', 'float64_t', 'np.float64'),
-          ('float32', 'float32_t', 'np.float32'),
-          ('object', 'object', 'object'),
-          ('int32', 'int32_t', 'np.int32'),
-          ('int64', 'int64_t', 'np.int64'),
-          ('uint64', 'uint64_t', 'np.uint64')]
-
-def get_dispatch(dtypes):
-
-    for name, c_type, dtype in dtypes:
-        yield name, c_type, dtype
-
-}}
-
-{{for name, c_type, dtype in get_dispatch(dtypes)}}
+ctypedef fused join_t:
+    float64_t
+    float32_t
+    object
+    int32_t
+    int64_t
+    uint64_t
 
 # Joins on ordered, unique indices
 
@@ -34,12 +23,11 @@ def get_dispatch(dtypes):
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
-def left_join_indexer_unique_{{name}}(ndarray[{{c_type}}] left,
-                                      ndarray[{{c_type}}] right):
+def left_join_indexer_unique(ndarray[join_t] left, ndarray[join_t] right):
     cdef:
         Py_ssize_t i, j, nleft, nright
         ndarray[int64_t] indexer
-        {{c_type}} lval, rval
+        join_t lval, rval
 
     i = 0
     j = 0
@@ -78,6 +66,37 @@ def left_join_indexer_unique_{{name}}(ndarray[{{c_type}}] left,
     return indexer
 
 
+left_join_indexer_unique_float64 = left_join_indexer_unique["float64_t"]
+left_join_indexer_unique_float32 = left_join_indexer_unique["float32_t"]
+left_join_indexer_unique_object = left_join_indexer_unique["object"]
+left_join_indexer_unique_int32 = left_join_indexer_unique["int32_t"]
+left_join_indexer_unique_int64 = left_join_indexer_unique["int64_t"]
+left_join_indexer_unique_uint64 = left_join_indexer_unique["uint64_t"]
+
+{{py:
+
+# name, c_type, dtype
+dtypes = [('float64', 'float64_t', 'np.float64'),
+          ('float32', 'float32_t', 'np.float32'),
+          ('object', 'object', 'object'),
+          ('int32', 'int32_t', 'np.int32'),
+          ('int64', 'int64_t', 'np.int64'),
+          ('uint64', 'uint64_t', 'np.uint64')]
+
+def get_dispatch(dtypes):
+
+    for name, c_type, dtype in dtypes:
+        yield name, c_type, dtype
+
+}}
+
+{{for name, c_type, dtype in get_dispatch(dtypes)}}
+
+# Joins on ordered, unique indices
+
+# right might contain non-unique values
+
+
 # @cython.wraparound(False)
 # @cython.boundscheck(False)
 def left_join_indexer_{{name}}(ndarray[{{c_type}}] left,