Skip to content

Commit 7f8a95b

Browse files
committed
Change internal representation of dataframe to not have "holes" to be filled.
Since the data structure is immutable, this is largely pointless.
1 parent cd209f3 commit 7f8a95b

21 files changed

Lines changed: 207 additions & 266 deletions

File tree

benchmark/Main.hs

Lines changed: 10 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -10,21 +10,19 @@ import System.Random (randomRIO)
1010

1111
stats :: Int -> IO ()
1212
stats n = do
13-
ns <- VU.replicateM n (randomRIO (-20.0 :: Double, 20.0))
14-
xs <- VU.replicateM n (randomRIO (-20.0 :: Double, 20.0))
15-
ys <- VU.replicateM n (randomRIO (-20.0 :: Double, 20.0))
16-
let df = D.fromNamedColumns [("first", D.UnboxedColumn ns),
17-
("second", D.UnboxedColumn xs),
18-
("third", D.UnboxedColumn ys)]
13+
ns <- do
14+
ns' <- VU.replicateM n (randomRIO (-20.0 :: Double, 20.0))
15+
pure $ replicate 3 ns'
16+
let df = D.fromUnamedColumns (map D.fromUnboxedVector ns)
1917

20-
print $ D.mean "first" df
21-
print $ D.variance "second" df
22-
print $ D.correlation "second" "third" df
23-
print $ D.select ["first"] df D.|> D.take 1
18+
print $ D.mean "0" df
19+
print $ D.variance "1" df
20+
print $ D.correlation "1" "2" df
21+
print $ D.filter "0" (>= (19.9 :: Double)) df D.|> D.take 10
2422

2523
main = defaultMain [
26-
bgroup "stats" [ bench "300_000" $ nfIO (stats 100_000)
24+
bgroup "stats" [ bench "300_000" $ toBenchmarkable (stats 100_000)
2725
, bench "3_000_000" $ nfIO (stats 1_000_000)
28-
, bench "30_000_000" $ nfIO (stats 30_000_000)
26+
, bench "300_000_000" $ nfIO (stats 300_000_000)
2927
]
3028
]

dataframe.cabal

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ source-repository head
2222
location: https://github.com/mchav/dataframe
2323

2424
library
25+
default-extensions: StrictData
2526
exposed-modules: DataFrame,
2627
DataFrame.Lazy
2728
other-modules: DataFrame.Internal.Types,

src/DataFrame/Display/Terminal/Plot.hs

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -55,33 +55,32 @@ plotHistogramsBy col plotSet orientation df = do
5555
plotForColumnBy col cname byColumn plotColumn orientation df
5656

5757
-- Plot code adapted from: https://alexwlchan.net/2018/ascii-bar-charts/
58-
plotForColumnBy :: HasCallStack => T.Text -> T.Text -> Maybe Column -> Maybe Column -> HistogramOrientation -> DataFrame -> IO ()
59-
plotForColumnBy _ _ Nothing _ _ _ = return ()
60-
plotForColumnBy byCol cname (Just (BoxedColumn (byColumn :: V.Vector a))) (Just (BoxedColumn (plotColumn :: V.Vector b))) orientation df = do
58+
plotForColumnBy :: HasCallStack => T.Text -> T.Text -> Column -> Column -> HistogramOrientation -> DataFrame -> IO ()
59+
plotForColumnBy byCol cname (BoxedColumn (byColumn :: V.Vector a)) (BoxedColumn (plotColumn :: V.Vector b)) orientation df = do
6160
let zipped = VG.zipWith (\left right -> (show left, show right)) plotColumn byColumn
6261
let counts = countOccurrences zipped
6362
if null counts || length counts > 20
6463
then pure ()
6564
else case orientation of
6665
VerticalHistogram -> error "Vertical histograms aren't yet supported"
6766
HorizontalHistogram -> plotGivenCounts' cname counts
68-
plotForColumnBy byCol cname (Just (UnboxedColumn byColumn)) (Just (BoxedColumn plotColumn)) orientation df = do
67+
plotForColumnBy byCol cname (UnboxedColumn byColumn) (BoxedColumn plotColumn) orientation df = do
6968
let zipped = VG.zipWith (\left right -> (show left, show right)) plotColumn (V.convert byColumn)
7069
let counts = countOccurrences zipped
7170
if null counts || length counts > 20
7271
then pure ()
7372
else case orientation of
7473
VerticalHistogram -> error "Vertical histograms aren't yet supported"
7574
HorizontalHistogram -> plotGivenCounts' cname counts
76-
plotForColumnBy byCol cname (Just (BoxedColumn byColumn)) (Just (UnboxedColumn plotColumn)) orientation df = do
75+
plotForColumnBy byCol cname (BoxedColumn byColumn) (UnboxedColumn plotColumn) orientation df = do
7776
let zipped = VG.zipWith (\left right -> (show left, show right)) (V.convert plotColumn) (V.convert byColumn)
7877
let counts = countOccurrences zipped
7978
if null counts || length counts > 20
8079
then pure ()
8180
else case orientation of
8281
-- VerticalHistogram -> plotVerticalGivenCounts cname counts
8382
HorizontalHistogram -> plotGivenCounts' cname counts
84-
plotForColumnBy byCol cname (Just (UnboxedColumn byColumn)) (Just (UnboxedColumn plotColumn)) orientation df = do
83+
plotForColumnBy byCol cname (UnboxedColumn byColumn) (UnboxedColumn plotColumn) orientation df = do
8584
let zipped = VG.zipWith (\left right -> (show left, show right)) (V.convert plotColumn) (V.convert byColumn)
8685
let counts = countOccurrences zipped
8786
if null counts || length counts > 20
@@ -93,9 +92,8 @@ plotForColumnBy byCol cname (Just (UnboxedColumn byColumn)) (Just (UnboxedColumn
9392
plotForColumnBy _ _ _ _ _ _ = return ()
9493

9594
-- Plot code adapted from: https://alexwlchan.net/2018/ascii-bar-charts/
96-
plotForColumn :: HasCallStack => T.Text -> Maybe Column -> HistogramOrientation -> DataFrame -> IO ()
97-
plotForColumn _ Nothing _ _ = return ()
98-
plotForColumn cname (Just (BoxedColumn (column :: V.Vector a))) orientation df = do
95+
plotForColumn :: HasCallStack => T.Text -> Column -> HistogramOrientation -> DataFrame -> IO ()
96+
plotForColumn cname (BoxedColumn (column :: V.Vector a)) orientation df = do
9997
let repa :: Ref.TypeRep a = Ref.typeRep @a
10098
repText :: Ref.TypeRep T.Text = Ref.typeRep @T.Text
10199
repString :: Ref.TypeRep String = Ref.typeRep @String
@@ -110,7 +108,7 @@ plotForColumn cname (Just (BoxedColumn (column :: V.Vector a))) orientation df =
110108
else case orientation of
111109
VerticalHistogram -> plotVerticalGivenCounts cname counts
112110
HorizontalHistogram -> plotGivenCounts cname counts
113-
plotForColumn cname (Just (UnboxedColumn (column :: VU.Vector a))) orientation df = do
111+
plotForColumn cname (UnboxedColumn (column :: VU.Vector a)) orientation df = do
114112
let repa :: Ref.TypeRep a = Ref.typeRep @a
115113
repText :: Ref.TypeRep T.Text = Ref.typeRep @T.Text
116114
repString :: Ref.TypeRep String = Ref.typeRep @String

src/DataFrame/IO/CSV.hs

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,10 @@
1-
{-# LANGUAGE BangPatterns #-}
21
{-# LANGUAGE ExplicitNamespaces #-}
32
{-# LANGUAGE LambdaCase #-}
43
{-# LANGUAGE OverloadedStrings #-}
54
{-# LANGUAGE ScopedTypeVariables #-}
65
{-# LANGUAGE TypeApplications #-}
76
{-# LANGUAGE GADTs #-}
87
{-# LANGUAGE RankNTypes #-}
9-
{-# LANGUAGE Strict #-}
108
module DataFrame.IO.CSV where
119

1210
import qualified Data.ByteString.Char8 as C
@@ -104,9 +102,8 @@ readSeparated c opts path = do
104102
cols <- V.mapM (freezeColumn mutableCols nulls' opts) (V.generate numColumns id)
105103
return $ DataFrame {
106104
columns = cols,
107-
freeIndices = [],
108105
columnIndices = M.fromList (zip columnNames [0..]),
109-
dataframeDimensions = (maybe 0 columnLength (cols V.! 0), V.length cols)
106+
dataframeDimensions = (maybe 0 columnLength (cols V.!? 0), V.length cols)
110107
}
111108
{-# INLINE readSeparated #-}
112109

@@ -160,10 +157,10 @@ writeValue mutableCols nullIndices count colIndex value = do
160157
{-# INLINE writeValue #-}
161158

162159
-- | Freezes a mutable vector into an immutable one, trimming it to the actual row count.
163-
freezeColumn :: VM.IOVector Column -> V.Vector [(Int, T.Text)] -> ReadOptions -> Int -> IO (Maybe Column)
160+
freezeColumn :: VM.IOVector Column -> V.Vector [(Int, T.Text)] -> ReadOptions -> Int -> IO Column
164161
freezeColumn mutableCols nulls opts colIndex = do
165162
col <- VM.unsafeRead mutableCols colIndex
166-
Just <$> freezeColumn' (nulls V.! colIndex) col
163+
freezeColumn' (nulls V.! colIndex) col
167164
{-# INLINE freezeColumn #-}
168165

169166
parseSep :: Char -> T.Text -> [T.Text]
@@ -215,7 +212,7 @@ lineEnd =
215212
countRows :: Char -> FilePath -> IO Int
216213
countRows c path = withFile path ReadMode $! go 0 ""
217214
where
218-
go !n !input h = do
215+
go n input h = do
219216
isEOF <- hIsEOF h
220217
if isEOF && input == mempty
221218
then pure n
@@ -250,8 +247,7 @@ getRowAsText :: DataFrame -> Int -> [T.Text]
250247
getRowAsText df i = V.ifoldr go [] (columns df)
251248
where
252249
indexMap = M.fromList (map (\(a, b) -> (b, a)) $ M.toList (columnIndices df))
253-
go k Nothing acc = acc
254-
go k (Just (BoxedColumn (c :: V.Vector a))) acc = case c V.!? i of
250+
go k (BoxedColumn (c :: V.Vector a)) acc = case c V.!? i of
255251
Just e -> textRep : acc
256252
where textRep = case testEquality (typeRep @a) (typeRep @T.Text) of
257253
Just Refl -> e
@@ -272,7 +268,7 @@ getRowAsText df i = V.ifoldr go [] (columns df)
272268
++ " has less items than "
273269
++ "the other columns at index "
274270
++ show i
275-
go k (Just (UnboxedColumn c)) acc = case c VU.!? i of
271+
go k (UnboxedColumn c) acc = case c VU.!? i of
276272
Just e -> T.pack (show e) : acc
277273
Nothing ->
278274
error $
@@ -281,7 +277,7 @@ getRowAsText df i = V.ifoldr go [] (columns df)
281277
++ " has less items than "
282278
++ "the other columns at index "
283279
++ show i
284-
go k (Just (OptionalColumn (c :: V.Vector (Maybe a)))) acc = case c V.!? i of
280+
go k (OptionalColumn (c :: V.Vector (Maybe a))) acc = case c V.!? i of
285281
Just e -> textRep : acc
286282
where textRep = case testEquality (typeRep @a) (typeRep @T.Text) of
287283
Just Refl -> fromMaybe "Nothing" e

src/DataFrame/Internal/Column.hs

Lines changed: 24 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
{-# LANGUAGE OverloadedStrings #-}
55
{-# LANGUAGE RankNTypes #-}
66
{-# LANGUAGE ScopedTypeVariables #-}
7-
{-# LANGUAGE Strict #-}
87
{-# LANGUAGE TypeApplications #-}
98
{-# LANGUAGE FlexibleContexts #-}
109
{-# LANGUAGE FlexibleInstances #-}
@@ -254,17 +253,19 @@ fromList ::
254253
=> [a] -> Column
255254
fromList = toColumnRep @(KindOf a) . VB.fromList
256255

257-
256+
-- | Type-level boolean for constraint/type comparison.
258257
data SBool (b :: Bool) where
259258
STrue :: SBool 'True
260259
SFalse :: SBool 'False
261260

261+
-- | The runtime witness for our type-level branching.
262262
class SBoolI (b :: Bool) where
263-
sbool :: SBool b -- the run-time witness
263+
sbool :: SBool b
264264

265265
instance SBoolI 'True where sbool = STrue
266266
instance SBoolI 'False where sbool = SFalse
267267

268+
-- | Type-level function to determine whether or not a type is unboxable.
268269
sUnbox :: forall a. SBoolI (Unboxable a) => SBool (Unboxable a)
269270
sUnbox = sbool @(Unboxable a)
270271

@@ -564,27 +565,37 @@ freezeColumn' nulls (MutableUnboxedColumn col)
564565

565566
-- | Fills the end of a column, up to length n, with Nothing (or with empty vectors for grouped columns). Does nothing if the column already has at least n elements.
566567
expandColumn :: Int -> Column -> Column
567-
expandColumn n (OptionalColumn col) = OptionalColumn $ col <> VB.replicate n Nothing
568+
expandColumn n (OptionalColumn col) = OptionalColumn $ col <> VB.replicate (n - VG.length col) Nothing
568569
expandColumn n column@(BoxedColumn col)
569-
| n < VG.length col = OptionalColumn $ VB.map Just col <> VB.replicate n Nothing
570+
| n > VG.length col = OptionalColumn $ VB.map Just col <> VB.replicate (n - VG.length col) Nothing
570571
| otherwise = column
571572
expandColumn n column@(UnboxedColumn col)
572-
| n < VG.length col = OptionalColumn $ VB.map Just (VU.convert col) <> VB.replicate n Nothing
573+
| n > VG.length col = OptionalColumn $ VB.map Just (VU.convert col) <> VB.replicate (n - VG.length col) Nothing
573574
| otherwise = column
574575
expandColumn n column@(GroupedBoxedColumn col)
575-
| n < VG.length col = GroupedBoxedColumn $ col <> VB.replicate n VB.empty
576+
| n > VG.length col = GroupedBoxedColumn $ col <> VB.replicate (n - VG.length col) VB.empty
576577
| otherwise = column
577578
expandColumn n column@(GroupedUnboxedColumn col)
578-
| n < VG.length col = GroupedUnboxedColumn $ col <> VB.replicate n VU.empty
579+
| n > VG.length col = GroupedUnboxedColumn $ col <> VB.replicate (n - VG.length col) VU.empty
579580
| otherwise = column
580581

581582
-- | Fills the beginning of a column, up to length n, with Nothing (or with empty vectors for grouped columns). Does nothing if the column already has at least n elements.
582583
leftExpandColumn :: Int -> Column -> Column
583-
leftExpandColumn n (OptionalColumn col) = OptionalColumn $ VB.replicate n Nothing <> col
584-
leftExpandColumn n (BoxedColumn col) = OptionalColumn $ VB.replicate n Nothing <> VB.map Just col
585-
leftExpandColumn n (UnboxedColumn col) = OptionalColumn $ VB.replicate n Nothing <> VB.map Just (VU.convert col)
586-
leftExpandColumn n (GroupedBoxedColumn col) = GroupedBoxedColumn $ VB.replicate n VB.empty <> col
587-
leftExpandColumn n (GroupedUnboxedColumn col) = GroupedUnboxedColumn $ VB.replicate n VU.empty <> col
584+
leftExpandColumn n column@(OptionalColumn col)
585+
| n > VG.length col = OptionalColumn $ VG.replicate (n - VG.length col) Nothing <> col
586+
| otherwise = column
587+
leftExpandColumn n column@(BoxedColumn col)
588+
| n > VG.length col = OptionalColumn $ VG.replicate (n - VG.length col) Nothing <> VG.map Just col
589+
| otherwise = column
590+
leftExpandColumn n column@(UnboxedColumn col)
591+
| n > VG.length col = OptionalColumn $ VG.replicate (n - VG.length col) Nothing <> VG.map Just (VU.convert col)
592+
| otherwise = column
593+
leftExpandColumn n column@(GroupedBoxedColumn col)
594+
| n > VG.length col = GroupedBoxedColumn $ VG.replicate (n - VG.length col) VB.empty <> col
595+
| otherwise = column
596+
leftExpandColumn n column@(GroupedUnboxedColumn col)
597+
| n > VG.length col = GroupedUnboxedColumn $ VG.replicate (n - VG.length col) VU.empty <> col
598+
| otherwise = column
588599

589600
-- | Concatenates two columns.
590601
concatColumns :: Column -> Column -> Maybe Column

src/DataFrame/Internal/DataFrame.hs

Lines changed: 11 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
{-# LANGUAGE ScopedTypeVariables #-}
55
{-# LANGUAGE TypeApplications #-}
66
{-# LANGUAGE GADTs #-}
7-
{-# LANGUAGE Strict #-}
87
{-# LANGUAGE FlexibleContexts #-}
98
module DataFrame.Internal.DataFrame where
109

@@ -25,11 +24,9 @@ import Type.Reflection (typeRep)
2524
data DataFrame = DataFrame
2625
{ -- | Our main data structure stores a dataframe as
2726
-- a vector of columns.
28-
columns :: V.Vector (Maybe Column),
27+
columns :: V.Vector Column,
2928
-- | Keeps the column names in the order they were inserted in.
3029
columnIndices :: M.Map T.Text Int,
31-
-- | Next free index that we insert a column into.
32-
freeIndices :: [Int],
3330
dataframeDimensions :: (Int, Int)
3431
}
3532

@@ -46,13 +43,12 @@ asText :: DataFrame -> Bool -> T.Text
4643
asText d properMarkdown =
4744
let header = "index" : map fst (sortBy (compare `on` snd) $ M.toList (columnIndices d))
4845
types = V.toList $ V.filter (/= "") $ V.map getType (columns d)
49-
getType :: Maybe Column -> T.Text
50-
getType Nothing = ""
51-
getType (Just (BoxedColumn (column :: V.Vector a))) = T.pack $ show (typeRep @a)
52-
getType (Just (UnboxedColumn (column :: VU.Vector a))) = T.pack $ show (typeRep @a)
53-
getType (Just (OptionalColumn (column :: V.Vector a))) = T.pack $ show (typeRep @a)
54-
getType (Just (GroupedBoxedColumn (column :: V.Vector a))) = T.pack $ show (typeRep @a)
55-
getType (Just (GroupedUnboxedColumn (column :: V.Vector a))) = T.pack $ show (typeRep @a)
46+
getType :: Column -> T.Text
47+
getType (BoxedColumn (column :: V.Vector a)) = T.pack $ show (typeRep @a)
48+
getType (UnboxedColumn (column :: VU.Vector a)) = T.pack $ show (typeRep @a)
49+
getType (OptionalColumn (column :: V.Vector a)) = T.pack $ show (typeRep @a)
50+
getType (GroupedBoxedColumn (column :: V.Vector a)) = T.pack $ show (typeRep @a)
51+
getType (GroupedUnboxedColumn (column :: V.Vector a)) = T.pack $ show (typeRep @a)
5652
-- Separate out cases dynamically so we don't end up making round trip string
5753
-- copies.
5854
get :: Maybe Column -> V.Vector T.Text
@@ -67,32 +63,22 @@ asText d properMarkdown =
6763
get (Just (GroupedUnboxedColumn column)) = V.map (T.pack . show) column
6864
getTextColumnFromFrame df (i, name) = if i == 0
6965
then V.fromList (map (T.pack . show) [0..(fst (dataframeDimensions df) - 1)])
70-
else get $ (V.!) (columns d) ((M.!) (columnIndices d) name)
66+
else get $ (V.!?) (columns d) ((M.!) (columnIndices d) name)
7167
rows =
7268
transpose $
7369
zipWith (curry (V.toList . getTextColumnFromFrame d)) [0..] header
7470
in (if properMarkdown then showTableProperMarkdown else showTable) header ("Int":types) rows
7571

7672
-- | O(1) Creates an empty dataframe
7773
empty :: DataFrame
78-
empty = DataFrame {columns = V.replicate initialColumnSize Nothing,
74+
empty = DataFrame {columns = V.empty,
7975
columnIndices = M.empty,
80-
freeIndices = [0..(initialColumnSize - 1)],
8176
dataframeDimensions = (0, 0) }
8277

83-
initialColumnSize :: Int
84-
initialColumnSize = 8
85-
8678
getColumn :: T.Text -> DataFrame -> Maybe Column
8779
getColumn name df = do
8880
i <- columnIndices df M.!? name
89-
join $ columns df V.!? i
81+
columns df V.!? i
9082

9183
null :: DataFrame -> Bool
92-
null df = dataframeDimensions df == (0, 0)
93-
94-
metadata :: DataFrame -> String
95-
metadata df = show (columnIndices df) ++ "\n" ++
96-
show (V.map (fmap columnVersionString) (columns df)) ++ "\n" ++
97-
show (freeIndices df) ++ "\n" ++
98-
show (dataframeDimensions df)
84+
null df = V.null (columns df)

src/DataFrame/Internal/Expression.hs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
{-# LANGUAGE OverloadedStrings #-}
55
{-# LANGUAGE RankNTypes #-}
66
{-# LANGUAGE ScopedTypeVariables #-}
7-
{-# LANGUAGE StrictData #-}
87
{-# LANGUAGE TypeApplications #-}
98
{-# LANGUAGE FlexibleContexts #-}
109
{-# LANGUAGE FlexibleInstances #-}

src/DataFrame/Internal/Parsing.hs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
{-# LANGUAGE OverloadedStrings #-}
2-
{-# LANGUAGE Strict #-}
32
module DataFrame.Internal.Parsing where
43

54
import qualified Data.ByteString.Char8 as C

src/DataFrame/Internal/Types.hs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88
{-# LANGUAGE ScopedTypeVariables #-}
99
{-# LANGUAGE TypeApplications #-}
1010
{-# LANGUAGE TypeOperators #-}
11-
{-# LANGUAGE Strict #-}
1211
module DataFrame.Internal.Types where
1312

1413
import Data.Int ( Int8, Int16, Int32, Int64 )

0 commit comments

Comments
 (0)