Skip to content

Commit 55b7b4b

Browse files
committed
Some parquet stuff that should be in a branch but it's just me for now
1 parent 2260430 commit 55b7b4b

1 file changed

Lines changed: 29 additions & 1 deletion

File tree

src/DataFrame/IO/Parquet.hs

Lines changed: 29 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -304,9 +304,31 @@ readParquet path = withBinaryFile path ReadMode $ \handle -> do
304304
when (magicString /= "PAR1") $ error "Invalid Parquet file"
305305

306306
metadata <- readMetadata handle size
307-
print metadata
307+
-- print metadata
308+
forM_ (rowGroups metadata) $ \r -> do
309+
forM_ (rowGroupColumns r) $ \c -> do
310+
-- print c
311+
let metadata = columnMetaData c
312+
let colDataPageOffset = columnDataPageOffset metadata
313+
let colDictionaryPageOffset = columnDictionaryPageOffset metadata
314+
let colStart = if colDictionaryPageOffset > 0 && colDataPageOffset > colDictionaryPageOffset
315+
then colDictionaryPageOffset
316+
else colDataPageOffset
317+
let colLength = columnTotalCompressedSize metadata
318+
-- print (colStart, colLength)
319+
columnBytes <-readBytes handle colStart colLength
320+
print $ columnBytes
308321
return DI.empty
309322

323+
-- | Read a contiguous run of bytes from a file handle.
--
-- Seeks @handle@ to the absolute offset @colStart@ and then reads up to
-- @colLen@ bytes, returning them as a list in file order.
--
-- * A non-positive @colLen@ yields an empty list; the seek still happens but
--   nothing is allocated or read.
-- * If the handle delivers fewer than @colLen@ bytes (for example because the
--   requested range runs past the end of the file), only the bytes that were
--   actually read are returned, so uninitialised buffer memory never ends up
--   in the result. Callers that require the full range can compare the length
--   of the result against @colLen@.
readBytes :: Handle -> Int64 -> Int64 -> IO [Word8]
readBytes handle colStart colLen = do
  -- Seek before allocating so that a failed seek cannot leak the buffer.
  hSeek handle AbsoluteSeek (fromIntegral colStart)
  if colLen <= 0
    then pure []
    else do
      buf <- mallocBytes (fromIntegral colLen) :: IO (Ptr Word8)
      -- hGetBuf returns the number of bytes it actually stored in the buffer,
      -- which may be smaller than the number requested.
      -- NOTE(review): an exception raised by hGetBuf would still skip the
      -- 'free' below; wrapping the allocation in a bracket-style helper would
      -- close that gap once such a helper is imported in this module.
      bytesRead <- hGetBuf handle buf (fromIntegral colLen)
      columnBytes <- readByteString' buf (fromIntegral bytesRead)
      free buf
      pure columnBytes
331+
310332
numBytesInFile :: Handle -> IO Integer
311333
numBytesInFile handle = do
312334
hSeek handle SeekFromEnd 0
@@ -958,6 +980,9 @@ readByteString buf pos = do
958980
size <- readVarIntFromBuffer @Int buf pos
959981
replicateM size (readAndAdvance pos buf)
960982

983+
-- | Collect the first @size@ bytes stored at @buf@ into a list.
--
-- Bytes are fetched one at a time with 'readSingleByte' at offsets @0@
-- through @size - 1@, in that order. A @size@ of zero or less produces an
-- empty list.
readByteString' :: Ptr Word8 -> Int64 -> IO [Word8]
readByteString' buf size = traverse byteAt offsets
  where
    offsets = [0 .. size - 1]
    byteAt offset = readSingleByte offset buf
985+
961986
readField :: Ptr Word8 -> IORef Int -> Int16 -> [Int16] -> IO (Maybe (TType, Int16))
962987
readField buf pos lastFieldId fieldStack = do
963988
t <- readAndAdvance pos buf
@@ -979,6 +1004,9 @@ readAndAdvance bufferPos buffer = do
9791004
modifyIORef bufferPos (+ 1)
9801005
return b
9811006

1007+
-- | Fetch the single byte located @pos@ bytes past the address @buffer@.
--
-- The offset is narrowed from 'Int64' to 'Int' before being handed to
-- 'peekByteOff'. No bounds checking is performed here.
readSingleByte :: Int64 -> Ptr b -> IO Word8
readSingleByte pos buffer = peekByteOff buffer byteOffset
  where
    byteOffset = fromIntegral pos :: Int
1009+
9821010
readNoAdvance :: IORef Int -> Ptr b -> IO Word8
9831011
readNoAdvance bufferPos buffer = do
9841012
pos <- readIORef bufferPos

0 commit comments

Comments
 (0)