Major improvement to the performance of filter.

mchav · mchav · commit 4218a15fe3d4 · 2025-07-16T03:06:11.000-07:00
Rather than building a set (which is inefficient in both memory and compute), use findIndices to identify all the indexes that satisfy a predicate, then select those indexes in the filter.
diff --git a/benchmark/Main.hs b/benchmark/Main.hs
@@ -6,10 +6,12 @@ import qualified Data.Vector.Unboxed as VU
 
 import Control.Monad (replicateM)
 import Criterion.Main
+import Data.Time
 import System.Random (randomRIO)
 
 stats :: Int -> IO ()
 stats n = do
+  startTime <- getCurrentTime
   ns <- do
     ns' <- VU.replicateM n (randomRIO (-20.0 :: Double, 20.0))
     pure $ replicate 3 ns'
@@ -19,6 +21,9 @@ stats n = do
   print $ D.variance "1" df
   print $ D.correlation "1" "2" df
   print $ D.filter "0" (>= (19.9 :: Double)) df D.|> D.take 10
+  endTime <- getCurrentTime
+  let diff = diffUTCTime endTime startTime
+  putStrLn $ "Execution Time: " ++ (show diff)
 
 main = defaultMain [
   bgroup "stats" [ bench     "300_000" $ nfIO (stats 100_000)
diff --git a/dataframe.cabal b/dataframe.cabal
@@ -22,7 +22,7 @@ source-repository head
   location: https://github.com/mchav/dataframe
 
 library
-    default-extensions: StrictData
+    -- default-extensions: StrictData
     exposed-modules: DataFrame,
                      DataFrame.Lazy
     other-modules: DataFrame.Internal.Types,
@@ -113,6 +113,7 @@ benchmark dataframe-benchmark
     build-depends: base >= 4.17.2.0 && < 4.22,
                    criterion >= 1 && <= 1.6.4.0,
                    text >= 2.0 && <= 2.1.2,
+                   time >= 1.12,
                    random >= 1 && <= 1.3.1,
                    vector ^>= 0.13,
                    dataframe
diff --git a/src/DataFrame/Internal/Column.hs b/src/DataFrame/Internal/Column.hs
@@ -378,6 +378,26 @@ getIndicesUnboxed :: (VU.Unbox a) => VU.Vector Int -> VU.Vector a -> VU.Vector a
 getIndicesUnboxed indices xs = VU.generate (VU.length indices) (\i -> xs VU.! (indices VU.! i))
 {-# INLINE getIndicesUnboxed #-}
 
+-- | Positions of the elements satisfying the predicate; 'Nothing' when @a@ differs from the column's element type.
+findIndices :: forall a. (Columnable a) => (a -> Bool) -> Column -> Maybe (VU.Vector Int)
+findIndices keep (BoxedColumn (xs :: VB.Vector b)) = do
+  Refl <- testEquality (typeRep @a) (typeRep @b)
+  Just (VG.convert (VG.findIndices keep xs))
+-- The vector is already unboxed, so the index vector needs no conversion.
+findIndices keep (UnboxedColumn (xs :: VU.Vector b)) = do
+  Refl <- testEquality (typeRep @a) (typeRep @b)
+  Just (VG.findIndices keep xs)
+-- Matched against @Maybe b@, so the predicate receives the 'Maybe'-wrapped values.
+findIndices keep (OptionalColumn (xs :: VB.Vector (Maybe b))) = do
+  Refl <- testEquality (typeRep @a) (typeRep @(Maybe b))
+  Just (VG.convert (VG.findIndices keep xs))
+findIndices keep (GroupedBoxedColumn (xs :: VB.Vector b)) = do
+  Refl <- testEquality (typeRep @a) (typeRep @b)
+  Just (VG.convert (VG.findIndices keep xs))
+findIndices keep (GroupedUnboxedColumn (xs :: VB.Vector b)) = do
+  Refl <- testEquality (typeRep @a) (typeRep @b)
+  Just (VG.convert (VG.findIndices keep xs))
+
 -- | An internal function that returns a vector of how indexes change after a column is sorted.
 sortedIndexes :: Bool -> Column -> VU.Vector Int
 sortedIndexes asc (BoxedColumn column ) = runST $ do
diff --git a/src/DataFrame/Operations/Subset.hs b/src/DataFrame/Operations/Subset.hs
@@ -78,16 +78,16 @@ filter ::
   DataFrame
 filter filterColumnName condition df = case getColumn filterColumnName df of
   Nothing -> throw $ ColumnNotFoundException filterColumnName "filter" (map fst $ M.toList $ columnIndices df)
-  Just column -> case ifoldlColumn (\s i v -> if condition v then S.insert i s else s) S.empty column of
+  Just column -> case findIndices condition column of
     Nothing -> throw $ TypeMismatchException (MkTypeErrorContext
                                                         { userType = Right $ typeRep @a
                                                         , expectedType = Left (columnTypeString column) :: Either String (TypeRep ()) 
                                                         , errorColumnName = Just (T.unpack filterColumnName)
                                                         , callingFunctionName = Just "filter"})
     Just indexes -> let
         c' = snd $ dataframeDimensions df
-        pick idxs col = atIndices idxs col
-      in df {columns = V.map (pick indexes) (columns df), dataframeDimensions = (S.size indexes, c')}
+        pick idxs col = atIndicesStable idxs col
+      in df {columns = V.map (pick indexes) (columns df), dataframeDimensions = (VG.length indexes, c')}
 
 -- | O(k) a version of filter where the predicate comes first.
 --
@@ -102,7 +102,7 @@ filterBy = flip filter
 filterWhere :: Expr Bool -> DataFrame -> DataFrame
 filterWhere expr df = let
     (TColumn col) = interpret @Bool df expr
-    (Just indexes) = VU.convert . V.map (fromMaybe 0) . V.filter isJust . toVector @(Maybe Int) <$> imapColumn (\i satisfied -> if satisfied then Just i else Nothing) col
+    (Just indexes) = findIndices (==True) col
     c' = snd $ dataframeDimensions df
     pick idxs col = atIndicesStable idxs col
   in df {columns = V.map (pick indexes) (columns df), dataframeDimensions = (VU.length indexes, c')}