@@ -556,6 +556,26 @@ def offset_labels(labels: np.ndarray, ngroups: int) -> tuple[np.ndarray, int]:
556
556
return offset , size
557
557
558
558
559
def fast_isin(ar1, ar2, invert=False):
    """Sort-based membership test of each element of ``ar1`` in ``ar2``.

    ``ar1`` is first reduced to its unique values with ``pd.factorize``; the
    membership test is run once per unique value and then broadcast back to
    the original positions through the factorize codes.

    Parameters
    ----------
    ar1 : 1D array-like
        Values to test.
    ar2 : array-like
        Values to test against. Duplicates are allowed.
    invert : bool, default False
        If True, return True where the element is NOT found in ``ar2``.

    Returns
    -------
    np.ndarray of bool
        One flag per element of ``ar1``. Missing values in ``ar1`` (those
        given the code -1 by ``pd.factorize``) are never treated as members,
        so their flag equals ``invert``.
    """
    codes, uniques = pd.factorize(ar1, sort=False)

    combined = np.concatenate((uniques, ar2))
    # We need this to be a stable sort, so always use 'mergesort'
    # here. The values from the first array should always come before
    # the values from the second array.
    order = combined.argsort(kind="mergesort")
    sorted_vals = combined[order]
    # ``uniques`` has no repeats, so a unique value is present in ``ar2``
    # exactly when its right-hand neighbour in sorted order is equal to it.
    if invert:
        neighbour_flag = sorted_vals[1:] != sorted_vals[:-1]
    else:
        neighbour_flag = sorted_vals[1:] == sorted_vals[:-1]
    # The last sorted element has no right-hand neighbour: it is "not found".
    flag = np.concatenate((neighbour_flag, [invert]))
    found = np.empty(combined.shape, dtype=bool)
    found[order] = flag

    missing = codes < 0
    if not missing.any():
        return found[codes]

    # Missing values are coded -1; indexing ``found`` with -1 would silently
    # read its last element (or raise IndexError when ``found`` is empty),
    # so fill those positions explicitly instead.
    result = np.full(codes.shape, bool(invert), dtype=bool)
    present = ~missing
    result[present] = found[codes[present]]
    return result
577
+
578
+
559
579
@overload
560
580
def factorize_ (
561
581
by : T_Bys ,
@@ -654,14 +674,20 @@ def factorize_(
654
674
if expect is not None and reindex :
655
675
sorter = np .argsort (expect )
656
676
groups = expect [(sorter ,)] if sort else expect
657
- idx = np .searchsorted (expect , flat , sorter = sorter )
658
- mask = ~ np .isin (flat , expect ) | isnull (flat ) | (idx == len (expect ))
677
+
678
+ mask = fast_isin (flat , expect , invert = True )
679
+ if not np .issubdtype (flat .dtype , np .integer ):
680
+ mask |= isnull (flat )
681
+
682
+ idx = np .full (flat .shape , - 1 )
683
+ result = np .searchsorted (expect .values , flat [~ mask ], sorter = sorter )
684
+ idx [~ mask ] = result
685
+ # idx = np.searchsorted(expect.values, flat, sorter=sorter)
686
+ # idx[mask] = -1
659
687
if not sort :
660
688
# idx is the index in to the sorted array.
661
689
# if we didn't want sorting, unsort it back
662
- idx [(idx == len (expect ),)] = - 1
663
690
idx = sorter [(idx ,)]
664
- idx [mask ] = - 1
665
691
else :
666
692
idx , groups = pd .factorize (flat , sort = sort ) # type: ignore[arg-type]
667
693
0 commit comments