DataHaskell
diff --git a/‎benchmark/Main.hs‎
Lines changed: 3 additions & 3 deletions b/‎benchmark/Main.hs‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎docs/coming_from_pandas.md‎
Lines changed: 11 additions & 11 deletions b/‎docs/coming_from_pandas.md‎
Lines changed: 11 additions & 11 deletions
diff --git a/‎docs/coming_from_polars.md‎
Lines changed: 21 additions & 43 deletions b/‎docs/coming_from_polars.md‎
Lines changed: 21 additions & 43 deletions
diff --git a/‎docs/haskell_for_data_analysis.md‎
Lines changed: 1 addition & 1 deletion b/‎docs/haskell_for_data_analysis.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/DataFrame/Errors.hs‎
Lines changed: 27 additions & 32 deletions b/‎src/DataFrame/Errors.hs‎
Lines changed: 27 additions & 32 deletions
@@ -13,9 +13,9 @@ stats n = do
   ns <- VU.replicateM n (randomRIO (-20.0 :: Double, 20.0))
   xs <- VU.replicateM n (randomRIO (-20.0 :: Double, 20.0))
   ys <- VU.replicateM n (randomRIO (-20.0 :: Double, 20.0))
-  let df = D.fromList [("first", D.UnboxedColumn ns),
-                       ("second", D.UnboxedColumn xs),
-                       ("third", D.UnboxedColumn ys)]
+  let df = D.fromNamedColumns [("first", D.UnboxedColumn ns),
+                               ("second", D.UnboxedColumn xs),
+                               ("third", D.UnboxedColumn ys)]
 
   print $ D.mean "first" df
   print $ D.variance "second" df
 
@@ -26,7 +26,7 @@ dtype: float64
 
 ```haskell
 ghci> import qualified DataFrame as D
-ghci> D.toColumn [1, 3, 5, read @Float "NaN", 6, 8]
+ghci> D.fromList [1, 3, 5, read @Float "NaN", 6, 8]
 [1.0,3.0,5.0,NaN,6.0,8.0]
 ```
 
@@ -40,7 +40,7 @@ DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
 
 ```haskell
 ghci> import Data.Time.Calendar
-ghci> dates = D.toColumn $ Prelude.take 6 $ [fromGregorian 2013 01 01..]
+ghci> dates = D.fromList $ Prelude.take 6 $ [fromGregorian 2013 01 01..]
 ghci> dates
 [2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06]
 ```
@@ -65,7 +65,7 @@ ghci> import System.Random (randomRIO)
 ghci> import Control.Monad (replicateM)
 ghci> import Data.List (foldl')
 ghci> :set -XOverloadedStrings
-ghci> initDf = D.fromList [("date", dates)]
+ghci> initDf = D.fromNamedColumns [("date", dates)]
 ghci> ns <- replicateM 4 (replicateM 6 (randomRIO (-2.0, 2.0)))
 ghci> df = foldl' (\d (name, col) -> D.insertColumn name (V.fromList col) d) initDf (zip ["A","B","C","D"] ns)
 ghci> df
@@ -82,7 +82,7 @@ index |    date    |          A          |          B           |          C
 5     | 2013-01-06 | -0.5541246187320041 | -1.5791034339829042  | -1.5650415391333796  | -1.7802523632196152
 ```
 
-As hinted in the previous example we can create a dataframe with `fromList`. This function takes in a list of tuples. We don't broadast values like python does i.e if you put in a single value into a column all other values will be null/nothing. But we'll detail how to get the same functionality.
+As hinted in the previous example we can create a dataframe with `fromNamedColumns`. This function takes in a list of tuples. We don't broadcast values like Python does, i.e. if you put a single value into a column all other values will be null/nothing. But we'll detail how to get the same functionality.
 
 ```python
 df2 = pd.DataFrame(
@@ -110,13 +110,13 @@ df2 = pd.DataFrame(
 -- All our data types must be printable and orderable.
 data Transport = Test | Train deriving (Show, Ord, Eq)
 ghci> :{
-ghci| df = D.fromList [
-ghci|        ("A", D.toColumn (replicate 4 1.0)),
-ghci|        ("B", D.toColumn (replicate 4 (fromGregorian 2013 01 02))),
-ghci|        ("C", D.toColumn (replicate 4 (1.0 :: Float))),
-ghci|        ("D", D.toColumn (replicate 4 (3 :: Int))),
-ghci|        ("E", D.toColumn (take 4 $ cycle [Test, Train])),
-ghci|        ("F", D.toColumn (replicate 4 "foo"))]
+ghci| df = D.fromNamedColumns [
+ghci|        ("A", D.fromList (replicate 4 1.0)),
+ghci|        ("B", D.fromList (replicate 4 (fromGregorian 2013 01 02))),
+ghci|        ("C", D.fromList (replicate 4 (1.0 :: Float))),
+ghci|        ("D", D.fromList (replicate 4 (3 :: Int))),
+ghci|        ("E", D.fromList (take 4 $ cycle [Test, Train])),
+ghci|        ("F", D.fromList (replicate 4 "foo"))]
 ghci|:}
 ghci> df
 --------------------------------------------------------------
 
@@ -42,16 +42,16 @@ import Data.Time.Calendar
 main :: IO
 main = do
     let df = D.fromList [
-        ("name", D.toColumn [ "Alice Archer"
+        ("name", D.fromList [ "Alice Archer"
                             , "Ben Brown"
                             , "Chloe Cooper"
                             , "Daniel Donovan"])
-        , ("birthdate", D.toColumn [ fromGregorian 1997 01 10
+        , ("birthdate", D.fromList [ fromGregorian 1997 01 10
                                    , fromGregorian 1985 02 15
                                    , fromGregorian 1983 03 22
                                    , fromGregorian 1981 04 30])
-        , ("weight", D.toColumn [57.9, 72.5, 53.6, 83.1])
-        , ("height", D.toColumn [1.56, 1.77, 1.65, 1.75])]
+        , ("weight", D.fromList [57.9, 72.5, 53.6, 83.1])
+        , ("height", D.fromList [1.56, 1.77, 1.65, 1.75])]
     print df
     D.writeCsv "./data/output.csv" df
     let df_csv = D.readCsv "./data/output.csv"
@@ -88,12 +88,7 @@ Notice that the type of the string column changes from `[Char]` (Haskell's defau
 
 ## Expressions
 
-Our equivalent to expressions is a tuple that contains a list of the column names followed by a
-function where the arguments correspond to the order of column names. We use a special function
-wrapper to make our dataframes accept functions with any number of arguments. This is done using
-the `func` function.
-
-This is a mouthful and is probably easier to see in action/comparison.
+We support expressions similar to Polars and PySpark. These expressions help us write row-level computations.
 
 For example:
 
@@ -122,31 +117,8 @@ main = do
     ...
     let year = (\(YearMonthDay y _ _) -> y)
     print $ df_csv
-          |> D.derive "birth_year" (lift year (D.col @Day "birthdate"))
-          |> D.derive "bmi" ((D.col @Double "weight") / (D.lift2 (**) (D.col @Double "height") (D.lit 2)))
-          |> D.select ["name", "birth_year", "bmi"]
-```
-
-Or, more clearly:
-
-```haskell
-{-# LANGUAGE ScopedTypeVariables #-}
-{-# LANGUAGE TypeApplications #-}
-import qualified DataFrame as D
-import qualified Data.Text as T
-
-import DataFrame ( (|>) )
-import Data.Time.Calendar
-
-main :: IO ()
-main = do
-    ...
-    let year = (\(YearMonthDay y _ _) -> y)
-    let bmi :: Double -> Double -> Double
-        bmi w h = w / h ** 2
-    print $ df_csv
-          |> D.derive "birth_year" (lift year (D.col @Day "birthdate"))
-          |> D.derive "bmi" ((D.col @Double "weight") / (D.lift2 (**) (D.col @Double "height") (D.lit 2)))
+          |> D.derive "birth_year" (D.lift year (D.col @Day "birthdate"))
+          |> D.derive "bmi" ((D.col @Double "weight") / (D.col @Double "height" ** D.lit 2))
           |> D.select ["name", "birth_year", "bmi"]
 ```
 
@@ -164,12 +136,15 @@ index |      name      | birth_year |        bmi
 3     | Daniel Donovan | 1981       | 27.13469387755102 
 ```
 
-The dataframe implementation can be read top down. `apply` a function that gets the year to the `birthdate`;
-store the result in the `birth_year` column; combine `weight` and `height` into the bmi column using the
-formula `w / h ** 2`; then select the `name`, `birth_year` and `bmi` fields.
 
-Dataframe focuses on splitting transformations into transformations on the whole dataframe so it's easily usable
-in a repl-like environment.
+The Haskell implementation can be read top down:
+* Create a column called `birth_year` by getting the year from the `birthdate` column.
+* Create a column called `bmi`, which is computed as `weight / height ** 2`.
+* Select the `name`, `birth_year` and `bmi` fields.
+
+`lift` takes a regular, unary (one argument) Haskell function and applies it to a column. To apply a binary function to two columns we use `lift2`.
+
+The Polars column type can be a single column or a list of columns. This means that applying a single transformation to many columns can be written as follows:
 
 In the example Polars expression expansion example:
 
@@ -181,7 +156,7 @@ result = df.select(
 print(result)
 ```
 
-We instead write this two `applyWithAlias` calls:
+In Haskell, we don't provide a way of doing this out of the box. So you'd have to write something more explicit:
 
 ```haskell
 df_csv
@@ -202,7 +177,7 @@ index |      name      |     height-5%      |     weight-5%
 3     | Daniel Donovan | 1.6624999999999999 | 78.945
 ```
 
-However we can make our program shorter by using regular Haskell and folding over the dataframe.
+We can use standard Haskell machinery to make the program short without sacrificing readability.
 
 ```haskell
 let reduce name = D.derive (name <> "-5%") ((col @Double name) * (lit 0.95))
@@ -211,16 +186,19 @@ df_csv
     |> D.select ["name", "weight-5%", "height-5%"]
 ```
 
-Or alternatively,
+Or alternatively, if our transformation only involves the variable we are modifying, we can write the same code as follows:
 
 ```haskell
 addSuffix suffix name = D.rename name (name <> suffix)
 df_csv
   |> D.applyMany ["weight", "height"] (*0.95)
+  -- We have to rename the fields so they match what we had before.
   |> D.fold (addSuffix "-5%")
   |> D.select ["name", "weight-5%", "height-5%"]
 ```
 
+This means that we can still rely on the expressive power of Haskell itself without relying entirely on the column expressions. This keeps our implementation more flexible.
+
 Filtering looks much the same:
 
 ```python
 
@@ -10,7 +10,7 @@ In Haskell, potentially missing values are represented by a "wrapper" type calle
 
 ```
 ghci> import qualified DataFrame as D
-ghci> let df = D.fromColumnList [D.toColumn [Just 1, Just 1, Nothing, Nothing], D.toColumn [Just 6.5, Nothing, Nothing, Just 6.5], D.toColumn [Just 3.0, Nothing, Nothing, Just 3.0]]
+ghci> let df = D.fromColumnList [D.fromList[Just 1, Just 1, Nothing, Nothing], D.fromList[Just 6.5, Nothing, Nothing, Just 6.5], D.fromList[Just 3.0, Nothing, Nothing, Just 3.0]]
 ghci> df
 ---------------------------------------------------
 index |       0       |      1       |      2      
 
@@ -10,30 +10,30 @@ import qualified Data.Text as T
 
 import Control.Exception
 import Data.Array
+import Data.Either
 import DataFrame.Display.Terminal.Colours
 import Data.Typeable (Typeable)
 import Type.Reflection (TypeRep)
 
+data TypeErrorContext a b = MkTypeErrorContext
+  { userType            :: Either String (TypeRep a)
+  , expectedType        :: Either String (TypeRep b)
+  , errorColumnName     :: Maybe String
+  , callingFunctionName :: Maybe String
+  }
+
 data DataFrameException where
     TypeMismatchException :: forall a b. (Typeable a, Typeable b)
-                          => TypeRep a -- ^ given type
-                          -> TypeRep b -- ^ expected type
-                          -> T.Text    -- ^ column name
-                          -> T.Text    -- ^ call point
+                          => TypeErrorContext a b
                           -> DataFrameException
-    TypeMismatchException' :: forall a . (Typeable a)
-                           => TypeRep a -- ^ given type
-                           -> String    -- ^ expected type
-                           -> T.Text    -- ^ column name
-                           -> T.Text    -- ^ call point
-                           -> DataFrameException
     ColumnNotFoundException :: T.Text -> T.Text -> [T.Text] -> DataFrameException
     deriving (Exception)
 
 instance Show DataFrameException where
     show :: DataFrameException -> String
-    show (TypeMismatchException a b columnName callPoint) = addCallPointInfo columnName (Just callPoint) (typeMismatchError a b)
-    show (TypeMismatchException' a b columnName callPoint) = addCallPointInfo columnName (Just callPoint) (typeMismatchError' (show a) b)
+    show (TypeMismatchException context) = let
+        errorString = typeMismatchError (either id show (userType context)) (either id show (expectedType context))
+      in addCallPointInfo (errorColumnName context) (callingFunctionName context) errorString
     show (ColumnNotFoundException columnName callPoint availableColumns) = columnNotFound columnName callPoint availableColumns
 
 columnNotFound :: T.Text -> T.Text -> [T.Text] -> String
@@ -47,51 +47,46 @@ columnNotFound name callPoint columns =
     ++ T.unpack (guessColumnName name columns)
     ++ "?\n\n"
 
-typeMismatchError ::
-  Type.Reflection.TypeRep a ->
-  Type.Reflection.TypeRep b ->
-  String
-typeMismatchError a b = typeMismatchError' (show a) (show b)
-
-typeMismatchError' :: String -> String -> String
-typeMismatchError' givenType expectedType =
+typeMismatchError :: String -> String -> String
+typeMismatchError givenType expectedType =
   red $
     red "\n\n[Error]: Type Mismatch"
       ++ "\n\tWhile running your code I tried to "
       ++ "get a column of type: "
       ++ red (show givenType)
-      ++ " but column was of type: "
+      ++ " but the column in the dataframe was actually of type: "
       ++ green (show expectedType)
 
-addCallPointInfo :: T.Text -> Maybe T.Text -> String -> String
-addCallPointInfo name (Just cp) err =
+addCallPointInfo :: Maybe String -> Maybe String -> String -> String
+addCallPointInfo (Just name) (Just cp) err =
   err
     ++ ( "\n\tThis happened when calling function "
-           ++ brightGreen (T.unpack cp)
+           ++ brightGreen cp
            ++ " on the column "
-           ++ brightGreen (T.unpack name)
+           ++ brightGreen name
            ++ "\n\n"
-           ++ typeAnnotationSuggestion (T.unpack cp)
+           ++ typeAnnotationSuggestion cp
        )
-addCallPointInfo name Nothing err =
+addCallPointInfo Nothing (Just cp) err = err ++ "\n" ++ typeAnnotationSuggestion cp
+addCallPointInfo (Just name) Nothing err =
   err
     ++ ( "\n\tOn the column "
-           ++ T.unpack name
+           ++ name
            ++ "\n\n"
-           ++ typeAnnotationSuggestion "<function>"
        )
+addCallPointInfo Nothing Nothing err = err
 
 typeAnnotationSuggestion :: String -> String
 typeAnnotationSuggestion cp =
   "\n\n\tTry adding a type at the end of the function e.g "
     ++ "change\n\t\t"
-    ++ red (cp ++ " arg1 arg2")
+    ++ red (cp ++ " ...")
     ++ " to \n\t\t"
-    ++ green ("(" ++ cp ++ " arg1 arg2 :: <Type>)")
+    ++ green ("(" ++ cp ++ " ... :: <Type>)")
     ++ "\n\tor add "
     ++ "{-# LANGUAGE TypeApplications #-} to the top of your "
     ++ "file then change the call to \n\t\t"
-    ++ brightGreen (cp ++ " @<Type> arg1 arg2")
+    ++ brightGreen (cp ++ " @<Type> ....")
 
 guessColumnName :: T.Text -> [T.Text] -> T.Text
 guessColumnName userInput columns = case map (\k -> (editDistance userInput k, k)) columns of