Aelve Codesearch

grep over package repositories
BCMtools-0.1.1
src/BCM.hs
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TypeFamilies #-}
module BCM
    ( ContactMap(..)
    , createContactMap
    , saveContactMap
    , openContactMap
    , closeContactMap
    ) where

import Control.Applicative ((<$>))
import Control.Monad (when, guard)
import Control.Monad.IO.Class (MonadIO(..))
import qualified Data.ByteString as L
import qualified Data.ByteString.Char8 as B
import qualified Data.HashMap.Strict as M
import Data.Serialize
import Data.Conduit
import qualified Data.Conduit.List as CL
import System.IO
import Data.List (foldl')
import Data.Word (Word32)
import Text.Printf (printf)

import qualified BCM.DiskMatrix as DM
import qualified BCM.IOMatrix as IOM

-- contact map binary format
-- 4 bytes magic + 4 byte Int (matrix start) + 4 bytes Int (step) + chroms + 1 bytes (reserve) + matrix
data ContactMap m = ContactMap
    { _rowLabels :: M.HashMap B.ByteString (Int, Int)
    , _colLabels :: M.HashMap B.ByteString (Int, Int)
    , _resolution :: Int
    , _matrix :: m
    , _handle :: Handle
    }


contact_map_magic :: Word32
contact_map_magic = 0x9921ABF0

createContactMap :: (IOM.IOMatrix m t Double, MonadIO io, mat ~ m t Double)
                 => FilePath
                 -> [(B.ByteString, Int)]
                 -> [(B.ByteString, Int)]
                 -> Int
                 -> Maybe Int
                 -> Sink (B.ByteString, Int, B.ByteString, Int, Double) io (ContactMap mat)
createContactMap fl rowChr colChr res len = do
    h <- liftIO $ openFile fl ReadWriteMode

    let source  = CL.mapM $ \(chr1, i, chr2, j, v) -> do
            let (p1, s1) = M.lookupDefault errMsg chr1 rLab
                (p2, s2) = M.lookupDefault errMsg chr2 cLab
                i' = i `div` res + p1
                j' = j `div` res + p2
            when (i > s1 || j > s2) $ error "createContactMap: Index out of bounds"
            when (i `mod` res /= 0 || j `mod` res /=0) $
                liftIO $ hPutStrLn stderr $ printf "(%d,%d) is not divisible by %d" i j res
            return ((i',j'), v)


    liftIO $ L.hPut h $ L.replicate (fromIntegral offset) 0

    m <- source $= IOM.hCreateMatrix h (r,c) len

    return $ ContactMap rLab cLab res m h
  where
    r = foldl' (\acc (_,x) -> acc + (x - 1) `div` res + 1)  0 rowChr
    c = foldl' (\acc (_,x) -> acc + (x - 1) `div` res + 1)  0 colChr
    rLab = mkLabels rowChr res
    cLab = mkLabels colChr res
    nByte x = let n1 = foldl' (+) 0 $ map B.length $ fst $ unzip x
                  n2 = 16 * n3
                  n3 = length x
               in n1 + n2 + n3
    offset = 4 + 4 + 4 + nByte rowChr + nByte colChr + 2 + 1
    errMsg = error "createContactMap: Unknown chromosome"

saveContactMap :: (IOM.IOMatrix m t a, mat ~ m t a) => ContactMap mat -> IO ()
saveContactMap (ContactMap rowChr colChr res mat handle) = do
        hSeek handle AbsoluteSeek 0
        L.hPutStr handle . runPut . putWord32le $ contact_map_magic
        L.hPutStr handle . runPut . putWord32le $ offset
        L.hPutStr handle . runPut . putWord32le . fromIntegral $ res
        L.hPutStr handle rowAndcol
        L.hPutStr handle . runPut . putWord8 $ 0
        IOM.hSaveMatrix handle mat
      where
        rowAndcol = L.concat [rows, "\0", cols, "\0"]
        rows = encodeLab . M.toList $ rowChr
        cols = encodeLab . M.toList $ colChr
        encodeLab xs = L.concat $ concatMap (\(chr, (a,b)) ->
            [chr, "\0", runPut $ putWord64le $ fromIntegral a, runPut $ putWord64le $ fromIntegral b]) xs
        offset = fromIntegral $ 4 + 4 + 4 + L.length rowAndcol + 1

openContactMap :: (IOM.IOMatrix m t a, MonadIO io, mat ~ m t a) => FilePath -> io (ContactMap mat)
openContactMap fl = liftIO $ do
    h <- openFile fl ReadWriteMode

    Right magic <- runGet getWord32le <$> L.hGet h 4
    guard $ magic == contact_map_magic
    _ <- runGet getWord32le <$> L.hGet h 4
    Right res <- fmap fromIntegral . runGet getWord32le <$> L.hGet h 4

    rows <- M.fromList <$> getChrs [] h
    cols <- M.fromList <$> getChrs [] h
    _ <- runGet getWord8 <$> L.hGet h 1
    mat <- IOM.hReadMatrix h
    x <- IOM.unsafeIndexM mat (1,1)
    return $ ContactMap rows cols res mat h
  where
    getChrs acc h = do
        chr <- getByteStringNul h
        if B.null chr
           then return acc
           else do
               Right a <- fmap fromIntegral . runGet getWord64le <$> L.hGet h 8
               Right b <- fmap fromIntegral . runGet getWord64le <$> L.hGet h 8
               getChrs ((chr, (a, b)) : acc) h
    getByteStringNul h = B.concat <$> go []
      where
        go acc = do
            x <- B.hGet h 1
            case x of
                "\0" -> return acc
                _ -> go $ acc ++ [x]

closeContactMap :: ContactMap mat -> IO ()
closeContactMap cm = hClose $ _handle cm


-------------------------------------------------------------------------------
-- Helper functions
-------------------------------------------------------------------------------

mkLabels :: [(B.ByteString, Int)] -> Int -> M.HashMap B.ByteString (Int, Int)
mkLabels xs step = M.fromList $ foldr f [] xs
  where
    f (chr, size) [] = [(chr, (0, size))]
    f (chr, size) acc@((_, (a, b)) : _) = (chr, (a + ((b - 1) `div` step + 1), size)) : acc
{-# INLINE mkLabels #-}