Skip to content

Commit 8c055d9

Browse files
committed
Various API updates:
* Rename the internal transform function to mapColumns. * Change the exception class to have a context that we can overwrite with the call-site. * Update tutorials
1 parent 373bc48 commit 8c055d9

21 files changed

Lines changed: 457 additions & 340 deletions

benchmark/Main.hs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,9 @@ stats n = do
1313
ns <- VU.replicateM n (randomRIO (-20.0 :: Double, 20.0))
1414
xs <- VU.replicateM n (randomRIO (-20.0 :: Double, 20.0))
1515
ys <- VU.replicateM n (randomRIO (-20.0 :: Double, 20.0))
16-
let df = D.fromList [("first", D.UnboxedColumn ns),
17-
("second", D.UnboxedColumn xs),
18-
("third", D.UnboxedColumn ys)]
16+
let df = D.fromNamedColumns [("first", D.UnboxedColumn ns),
17+
("second", D.UnboxedColumn xs),
18+
("third", D.UnboxedColumn ys)]
1919

2020
print $ D.mean "first" df
2121
print $ D.variance "second" df

docs/coming_from_pandas.md

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ dtype: float64
2626

2727
```haskell
2828
ghci> import qualified DataFrame as D
29-
ghci> D.toColumn [1, 3, 5, read @Float "NaN", 6, 8]
29+
ghci> D.fromList [1, 3, 5, read @Float "NaN", 6, 8]
3030
[1.0,3.0,5.0,NaN,6.0,8.0]
3131
```
3232

@@ -40,7 +40,7 @@ DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
4040

4141
```haskell
4242
ghci> import Data.Time.Calendar
43-
ghci> dates = D.toColumn $ Prelude.take 6 $ [fromGregorian 2013 01 01..]
43+
ghci> dates = D.fromList $ Prelude.take 6 $ [fromGregorian 2013 01 01..]
4444
ghci> dates
4545
[2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06]
4646
```
@@ -65,7 +65,7 @@ ghci> import System.Random (randomRIO)
6565
ghci> import Control.Monad (replicateM)
6666
ghci> import Data.List (foldl')
6767
ghci> :set -XOverloadedStrings
68-
ghci> initDf = D.fromList [("date", dates)]
68+
ghci> initDf = D.fromNamedColumns [("date", dates)]
6969
ghci> ns <- replicateM 4 (replicateM 6 (randomRIO (-2.0, 2.0)))
7070
ghci> df = foldl' (\d (name, col) -> D.insertColumn name (V.fromList col) d) initDf (zip ["A","B","C","D"] ns)
7171
ghci> df
@@ -82,7 +82,7 @@ index | date | A | B | C
8282
5 | 2013-01-06 | -0.5541246187320041 | -1.5791034339829042 | -1.5650415391333796 | -1.7802523632196152
8383
```
8484

85-
As hinted in the previous example we can create a dataframe with `fromList`. This function takes in a list of tuples. We don't broadast values like python does i.e if you put in a single value into a column all other values will be null/nothing. But we'll detail how to get the same functionality.
85+
As hinted in the previous example, we can create a dataframe with `fromNamedColumns`. This function takes in a list of tuples. We don't broadcast values like Python does, i.e. if you put a single value into a column, all other values will be null/nothing. But we'll detail how to get the same functionality.
8686

8787
```python
8888
df2 = pd.DataFrame(
@@ -110,13 +110,13 @@ df2 = pd.DataFrame(
110110
-- All our data types must be printable and orderable.
111111
data Transport = Test | Train deriving (Show, Ord, Eq)
112112
ghci> :{
113-
ghci| df = D.fromList [
114-
ghci| ("A", D.toColumn (replicate 4 1.0)),
115-
ghci| ("B", D.toColumn (replicate 4 (fromGregorian 2013 01 02))),
116-
ghci| ("C", D.toColumn (replicate 4 (1.0 :: Float))),
117-
ghci| ("D", D.toColumn (replicate 4 (3 :: Int))),
118-
ghci| ("E", D.toColumn (take 4 $ cycle [Test, Train])),
119-
ghci| ("F", D.toColumn (replicate 4 "foo"))]
113+
ghci| df = D.fromNamedColumns [
114+
ghci| ("A", D.fromList (replicate 4 1.0)),
115+
ghci| ("B", D.fromList (replicate 4 (fromGregorian 2013 01 02))),
116+
ghci| ("C", D.fromList (replicate 4 (1.0 :: Float))),
117+
ghci| ("D", D.fromList (replicate 4 (3 :: Int))),
118+
ghci| ("E", D.fromList (take 4 $ cycle [Test, Train])),
119+
ghci| ("F", D.fromList (replicate 4 "foo"))]
120120
ghci|:}
121121
ghci> df
122122
--------------------------------------------------------------

docs/coming_from_polars.md

Lines changed: 21 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -42,16 +42,16 @@ import Data.Time.Calendar
4242
main :: IO
4343
main = do
4444
let df = D.fromList [
45-
("name", D.toColumn [ "Alice Archer"
45+
("name", D.fromList [ "Alice Archer"
4646
, "Ben Brown"
4747
, "Chloe Cooper"
4848
, "Daniel Donovan"])
49-
, ("birthdate", D.toColumn [ fromGregorian 1997 01 10
49+
, ("birthdate", D.fromList [ fromGregorian 1997 01 10
5050
, fromGregorian 1985 02 15
5151
, fromGregorian 1983 03 22
5252
, fromGregorian 1981 04 30])
53-
, ("weight", D.toColumn [57.9, 72.5, 53.6, 83.1])
54-
, ("height", D.toColumn [1.56, 1.77, 1.65, 1.75])]
53+
, ("weight", D.fromList [57.9, 72.5, 53.6, 83.1])
54+
, ("height", D.fromList [1.56, 1.77, 1.65, 1.75])]
5555
print df
5656
D.writeCsv "./data/output.csv" df
5757
let df_csv = D.readCsv "./data/output.csv"
@@ -88,12 +88,7 @@ Notice that the type of the string column changes from `[Char]` (Haskell's defau
8888

8989
## Expressions
9090

91-
Our equivalent to expressions is a tuple that contains a list of the column names followed by a
92-
function where the arguments correspond to the order of column names. We use a special function
93-
wrapper to make our dataframes accept functions with any number of arguments. This is done using
94-
the `func` function.
95-
96-
This is a mouthful and is probably easier to see in action/comparison.
91+
We support expressions similar to Polars and PySpark. These expressions help us write row-level computations.
9792

9893
For example:
9994

@@ -122,31 +117,8 @@ main = do
122117
...
123118
let year = (\(YearMonthDay y _ _) -> y)
124119
print $ df_csv
125-
|> D.derive "birth_year" (lift year (D.col @Day "birthdate"))
126-
|> D.derive "bmi" ((D.col @Double "weight") / (D.lift2 (**) (D.col @Double "height") (D.lit 2)))
127-
|> D.select ["name", "birth_year", "bmi"]
128-
```
129-
130-
Or, more clearly:
131-
132-
```haskell
133-
{-# LANGUAGE ScopedTypeVariables #-}
134-
{-# LANGUAGE TypeApplications #-}
135-
import qualified DataFrame as D
136-
import qualified Data.Text as T
137-
138-
import DataFrame ( (|>) )
139-
import Data.Time.Calendar
140-
141-
main :: IO ()
142-
main = do
143-
...
144-
let year = (\(YearMonthDay y _ _) -> y)
145-
let bmi :: Double -> Double -> Double
146-
bmi w h = w / h ** 2
147-
print $ df_csv
148-
|> D.derive "birth_year" (lift year (D.col @Day "birthdate"))
149-
|> D.derive "bmi" ((D.col @Double "weight") / (D.lift2 (**) (D.col @Double "height") (D.lit 2)))
120+
|> D.derive "birth_year" (D.lift year (D.col @Day "birthdate"))
121+
|> D.derive "bmi" ((D.col @Double "weight") / (D.col @Double "height" ** D.lit 2))
150122
|> D.select ["name", "birth_year", "bmi"]
151123
```
152124

@@ -164,12 +136,15 @@ index | name | birth_year | bmi
164136
3 | Daniel Donovan | 1981 | 27.13469387755102
165137
```
166138

167-
The dataframe implementation can be read top down. `apply` a function that gets the year to the `birthdate`;
168-
store the result in the `birth_year` column; combine `weight` and `height` into the bmi column using the
169-
formula `w / h ** 2`; then select the `name`, `birth_year` and `bmi` fields.
170139

171-
Dataframe focuses on splitting transformations into transformations on the whole dataframe so it's easily usable
172-
in a repl-like environment.
140+
The Haskell implementation can be read top down:
141+
* Create a column called `birth_year` by getting the year from the `birthdate` column.
142+
* Create a column called `bmi` which is computed as `weight / height ** 2`,
143+
* then select the `name`, `birth_year` and `bmi` fields.
144+
145+
`lift` takes a regular, unary (one argument) Haskell function and applies it to a column. To apply a binary function to two columns we use `lift2`.
146+
147+
The Polars column type can be a single column or a list of columns. This means that applying a single transformation to many columns can be written as follows:
173148

174149
In the Polars expression expansion example:
175150

@@ -181,7 +156,7 @@ result = df.select(
181156
print(result)
182157
```
183158

184-
We instead write this two `applyWithAlias` calls:
159+
In Haskell, we don't provide a way of doing this out of the box. So you'd have to write something more explicit:
185160

186161
```haskell
187162
df_csv
@@ -202,7 +177,7 @@ index | name | height-5% | weight-5%
202177
3 | Daniel Donovan | 1.6624999999999999 | 78.945
203178
```
204179

205-
However we can make our program shorter by using regular Haskell and folding over the dataframe.
180+
We can use standard Haskell machinery to make the program short without sacrificing readability.
206181

207182
```haskell
208183
let reduce name = D.derive (name <> "-5%") ((col @Double name) * (lit 0.95))
@@ -211,16 +186,19 @@ df_csv
211186
|> D.select ["name", "weight-5%", "height-5%"]
212187
```
213188

214-
Or alternatively,
189+
Or alternatively, if our transformation only involves the variable we are modifying we can write the same code as follows:
215190

216191
```haskell
217192
addSuffix suffix name = D.rename name (name <> suffix)
218193
df_csv
219194
|> D.applyMany ["weight", "height"] (*0.95)
195+
-- We have to rename the fields so they match what we had before.
220196
|> D.fold (addSuffix "-5%")
221197
|> D.select ["name", "weight-5%", "height-5%"]
222198
```
223199

200+
This means that we can still rely on the expressive power of Haskell itself without relying entirely on the column expressions. This keeps our implementation more flexible.
201+
224202
Filtering looks much the same:
225203

226204
```python

docs/haskell_for_data_analysis.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ In Haskell, potentially missing values are represented by a "wrapper" type calle
1010

1111
```
1212
ghci> import qualified DataFrame as D
13-
ghci> let df = D.fromColumnList [D.toColumn [Just 1, Just 1, Nothing, Nothing], D.toColumn [Just 6.5, Nothing, Nothing, Just 6.5], D.toColumn [Just 3.0, Nothing, Nothing, Just 3.0]]
13+
ghci> let df = D.fromColumnList [D.fromList[Just 1, Just 1, Nothing, Nothing], D.fromList[Just 6.5, Nothing, Nothing, Just 6.5], D.fromList[Just 3.0, Nothing, Nothing, Just 3.0]]
1414
ghci> df
1515
---------------------------------------------------
1616
index | 0 | 1 | 2

src/DataFrame/Errors.hs

Lines changed: 27 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -10,30 +10,30 @@ import qualified Data.Text as T
1010

1111
import Control.Exception
1212
import Data.Array
13+
import Data.Either
1314
import DataFrame.Display.Terminal.Colours
1415
import Data.Typeable (Typeable)
1516
import Type.Reflection (TypeRep)
1617

18+
data TypeErrorContext a b = MkTypeErrorContext
19+
{ userType :: Either String (TypeRep a)
20+
, expectedType :: Either String (TypeRep b)
21+
, errorColumnName :: Maybe String
22+
, callingFunctionName :: Maybe String
23+
}
24+
1725
data DataFrameException where
1826
TypeMismatchException :: forall a b. (Typeable a, Typeable b)
19-
=> TypeRep a -- ^ given type
20-
-> TypeRep b -- ^ expected type
21-
-> T.Text -- ^ column name
22-
-> T.Text -- ^ call point
27+
=> TypeErrorContext a b
2328
-> DataFrameException
24-
TypeMismatchException' :: forall a . (Typeable a)
25-
=> TypeRep a -- ^ given type
26-
-> String -- ^ expected type
27-
-> T.Text -- ^ column name
28-
-> T.Text -- ^ call point
29-
-> DataFrameException
3029
ColumnNotFoundException :: T.Text -> T.Text -> [T.Text] -> DataFrameException
3130
deriving (Exception)
3231

3332
instance Show DataFrameException where
3433
show :: DataFrameException -> String
35-
show (TypeMismatchException a b columnName callPoint) = addCallPointInfo columnName (Just callPoint) (typeMismatchError a b)
36-
show (TypeMismatchException' a b columnName callPoint) = addCallPointInfo columnName (Just callPoint) (typeMismatchError' (show a) b)
34+
show (TypeMismatchException context) = let
35+
errorString = typeMismatchError (either id show (userType context)) (either id show (expectedType context))
36+
in addCallPointInfo (errorColumnName context) (callingFunctionName context) errorString
3737
show (ColumnNotFoundException columnName callPoint availableColumns) = columnNotFound columnName callPoint availableColumns
3838

3939
columnNotFound :: T.Text -> T.Text -> [T.Text] -> String
@@ -47,51 +47,46 @@ columnNotFound name callPoint columns =
4747
++ T.unpack (guessColumnName name columns)
4848
++ "?\n\n"
4949

50-
typeMismatchError ::
51-
Type.Reflection.TypeRep a ->
52-
Type.Reflection.TypeRep b ->
53-
String
54-
typeMismatchError a b = typeMismatchError' (show a) (show b)
55-
56-
typeMismatchError' :: String -> String -> String
57-
typeMismatchError' givenType expectedType =
50+
typeMismatchError :: String -> String -> String
51+
typeMismatchError givenType expectedType =
5852
red $
5953
red "\n\n[Error]: Type Mismatch"
6054
++ "\n\tWhile running your code I tried to "
6155
++ "get a column of type: "
6256
++ red (show givenType)
63-
++ " but column was of type: "
57+
++ " but the column in the dataframe was actually of type: "
6458
++ green (show expectedType)
6559

66-
addCallPointInfo :: T.Text -> Maybe T.Text -> String -> String
67-
addCallPointInfo name (Just cp) err =
60+
addCallPointInfo :: Maybe String -> Maybe String -> String -> String
61+
addCallPointInfo (Just name) (Just cp) err =
6862
err
6963
++ ( "\n\tThis happened when calling function "
70-
++ brightGreen (T.unpack cp)
64+
++ brightGreen cp
7165
++ " on the column "
72-
++ brightGreen (T.unpack name)
66+
++ brightGreen name
7367
++ "\n\n"
74-
++ typeAnnotationSuggestion (T.unpack cp)
68+
++ typeAnnotationSuggestion cp
7569
)
76-
addCallPointInfo name Nothing err =
70+
addCallPointInfo Nothing (Just cp) err = err ++ "\n" ++ typeAnnotationSuggestion cp
71+
addCallPointInfo (Just name) Nothing err =
7772
err
7873
++ ( "\n\tOn the column "
79-
++ T.unpack name
74+
++ name
8075
++ "\n\n"
81-
++ typeAnnotationSuggestion "<function>"
8276
)
77+
addCallPointInfo Nothing Nothing err = err
8378

8479
typeAnnotationSuggestion :: String -> String
8580
typeAnnotationSuggestion cp =
8681
"\n\n\tTry adding a type at the end of the function e.g "
8782
++ "change\n\t\t"
88-
++ red (cp ++ " arg1 arg2")
83+
++ red (cp ++ " ...")
8984
++ " to \n\t\t"
90-
++ green ("(" ++ cp ++ " arg1 arg2 :: <Type>)")
85+
++ green ("(" ++ cp ++ " ... :: <Type>)")
9186
++ "\n\tor add "
9287
++ "{-# LANGUAGE TypeApplications #-} to the top of your "
9388
++ "file then change the call to \n\t\t"
94-
++ brightGreen (cp ++ " @<Type> arg1 arg2")
89+
++ brightGreen (cp ++ " @<Type> ....")
9590

9691
guessColumnName :: T.Text -> [T.Text] -> T.Text
9792
guessColumnName userInput columns = case map (\k -> (editDistance userInput k, k)) columns of

0 commit comments

Comments
 (0)