{-# LANGUAGE OverloadedStrings #-}
module Poseidon.CLI.Genoconvert where
import Poseidon.EntityTypes (HasNameAndVersion (..))
import Poseidon.GenotypeData (GenoDataSource (..),
GenotypeDataSpec (..),
GenotypeFormatSpec (..),
loadGenotypeData,
printSNPCopyProgress)
import Poseidon.Package (PackageReadOptions (..),
PoseidonException (PoseidonGenotypeExceptionForward),
PoseidonPackage (..),
defaultPackageReadOptions,
makePseudoPackageFromGenotypeData,
readPoseidonPackageCollection,
writePoseidonPackage)
import Poseidon.Utils (PoseidonIO, envErrorLength,
envInputPlinkMode, envLogAction,
logInfo, logWarning)
import Control.Exception (catch, throwIO)
import Control.Monad (unless, when)
import Data.Maybe (isJust)
import Data.Time (getCurrentTime)
import Pipes (MonadIO (liftIO), runEffect, (>->))
import Pipes.Safe (runSafeT)
import SequenceFormats.Eigenstrat (writeEigenstrat)
import SequenceFormats.Plink (PlinkPopNameMode,
eigenstratInd2PlinkFam, writePlink)
import System.Directory (createDirectoryIfMissing,
doesFileExist, removeFile)
import System.FilePath (dropTrailingPathSeparator, (<.>),
(</>))
data GenoconvertOptions = GenoconvertOptions
{ GenoconvertOptions -> [GenoDataSource]
_genoconvertGenoSources :: [GenoDataSource]
, GenoconvertOptions -> GenotypeFormatSpec
_genoConvertOutFormat :: GenotypeFormatSpec
, GenoconvertOptions -> Bool
_genoConvertOutOnlyGeno :: Bool
, GenoconvertOptions -> Maybe [Char]
_genoMaybeOutPackagePath :: Maybe FilePath
, GenoconvertOptions -> Bool
_genoconvertRemoveOld :: Bool
, GenoconvertOptions -> PlinkPopNameMode
_genoconvertOutPlinkPopMode :: PlinkPopNameMode
, GenoconvertOptions -> Bool
_genoconvertOnlyLatest :: Bool
}
runGenoconvert :: GenoconvertOptions -> PoseidonIO ()
runGenoconvert :: GenoconvertOptions -> PoseidonIO ()
runGenoconvert (GenoconvertOptions [GenoDataSource]
genoSources GenotypeFormatSpec
outFormat Bool
onlyGeno Maybe [Char]
outPath Bool
removeOld PlinkPopNameMode
outPlinkPopMode Bool
onlyLatest) = do
let pacReadOpts :: PackageReadOptions
pacReadOpts = PackageReadOptions
defaultPackageReadOptions {
_readOptIgnoreChecksums :: Bool
_readOptIgnoreChecksums = Bool
True
, _readOptIgnoreGeno :: Bool
_readOptIgnoreGeno = Bool
False
, _readOptGenoCheck :: Bool
_readOptGenoCheck = Bool
True
, _readOptOnlyLatest :: Bool
_readOptOnlyLatest = Bool
onlyLatest
}
[PoseidonPackage]
properPackages <- PackageReadOptions -> [[Char]] -> PoseidonIO [PoseidonPackage]
readPoseidonPackageCollection PackageReadOptions
pacReadOpts ([[Char]] -> PoseidonIO [PoseidonPackage])
-> [[Char]] -> PoseidonIO [PoseidonPackage]
forall a b. (a -> b) -> a -> b
$ [GenoDataSource -> [Char]
getPacBaseDirs GenoDataSource
x | x :: GenoDataSource
x@PacBaseDir {} <- [GenoDataSource]
genoSources]
PlinkPopNameMode
inPlinkPopMode <- PoseidonIO PlinkPopNameMode
envInputPlinkMode
[PoseidonPackage]
pseudoPackages <- (GenotypeDataSpec -> ReaderT Env IO PoseidonPackage)
-> [GenotypeDataSpec] -> PoseidonIO [PoseidonPackage]
forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
(a -> m b) -> t a -> m (t b)
forall (m :: * -> *) a b. Monad m => (a -> m b) -> [a] -> m [b]
mapM GenotypeDataSpec -> ReaderT Env IO PoseidonPackage
makePseudoPackageFromGenotypeData [GenoDataSource -> GenotypeDataSpec
getGenoDirect GenoDataSource
x | x :: GenoDataSource
x@GenoDirect {} <- [GenoDataSource]
genoSources]
[Char] -> PoseidonIO ()
logInfo ([Char] -> PoseidonIO ()) -> [Char] -> PoseidonIO ()
forall a b. (a -> b) -> a -> b
$ [Char]
"Unpackaged genotype data files loaded: " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++ Int -> [Char]
forall a. Show a => a -> [Char]
show ([PoseidonPackage] -> Int
forall a. [a] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [PoseidonPackage]
pseudoPackages)
(PoseidonPackage -> PoseidonIO ())
-> [PoseidonPackage] -> PoseidonIO ()
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
(a -> m b) -> t a -> m ()
mapM_ (GenotypeFormatSpec
-> Bool
-> Maybe [Char]
-> Bool
-> PlinkPopNameMode
-> PlinkPopNameMode
-> PoseidonPackage
-> PoseidonIO ()
convertGenoTo GenotypeFormatSpec
outFormat Bool
onlyGeno Maybe [Char]
outPath Bool
removeOld PlinkPopNameMode
inPlinkPopMode PlinkPopNameMode
outPlinkPopMode) [PoseidonPackage]
properPackages
(PoseidonPackage -> PoseidonIO ())
-> [PoseidonPackage] -> PoseidonIO ()
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
(a -> m b) -> t a -> m ()
mapM_ (GenotypeFormatSpec
-> Bool
-> Maybe [Char]
-> Bool
-> PlinkPopNameMode
-> PlinkPopNameMode
-> PoseidonPackage
-> PoseidonIO ()
convertGenoTo GenotypeFormatSpec
outFormat Bool
True Maybe [Char]
outPath Bool
removeOld PlinkPopNameMode
inPlinkPopMode PlinkPopNameMode
outPlinkPopMode) [PoseidonPackage]
pseudoPackages
convertGenoTo :: GenotypeFormatSpec -> Bool -> Maybe FilePath -> Bool -> PlinkPopNameMode ->
PlinkPopNameMode -> PoseidonPackage -> PoseidonIO ()
convertGenoTo :: GenotypeFormatSpec
-> Bool
-> Maybe [Char]
-> Bool
-> PlinkPopNameMode
-> PlinkPopNameMode
-> PoseidonPackage
-> PoseidonIO ()
convertGenoTo GenotypeFormatSpec
outFormat Bool
onlyGeno Maybe [Char]
outPath Bool
removeOld PlinkPopNameMode
inPlinkPopMode PlinkPopNameMode
outPlinkPopMode PoseidonPackage
pac = do
[Char] -> PoseidonIO ()
logInfo ([Char] -> PoseidonIO ()) -> [Char] -> PoseidonIO ()
forall a b. (a -> b) -> a -> b
$
[Char]
"Converting genotype data in "
[Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++ PacNameAndVersion -> [Char]
forall a. Show a => a -> [Char]
show (PoseidonPackage -> PacNameAndVersion
posPacNameAndVersion PoseidonPackage
pac)
[Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++ [Char]
" to format "
[Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++ GenotypeFormatSpec -> [Char]
forall a. Show a => a -> [Char]
show GenotypeFormatSpec
outFormat
[Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++ [Char]
":"
let outName :: [Char]
outName = PacNameAndVersion -> [Char]
forall a. HasNameAndVersion a => a -> [Char]
getPacName (PacNameAndVersion -> [Char])
-> (PoseidonPackage -> PacNameAndVersion)
-> PoseidonPackage
-> [Char]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. PoseidonPackage -> PacNameAndVersion
posPacNameAndVersion (PoseidonPackage -> [Char]) -> PoseidonPackage -> [Char]
forall a b. (a -> b) -> a -> b
$ PoseidonPackage
pac
let ([Char]
outInd, [Char]
outSnp, [Char]
outGeno) = case GenotypeFormatSpec
outFormat of
GenotypeFormatSpec
GenotypeFormatEigenstrat -> ([Char]
outName [Char] -> [Char] -> [Char]
<.> [Char]
".ind", [Char]
outName [Char] -> [Char] -> [Char]
<.> [Char]
".snp", [Char]
outName [Char] -> [Char] -> [Char]
<.> [Char]
".geno")
GenotypeFormatSpec
GenotypeFormatPlink -> ([Char]
outName [Char] -> [Char] -> [Char]
<.> [Char]
".fam", [Char]
outName [Char] -> [Char] -> [Char]
<.> [Char]
".bim", [Char]
outName [Char] -> [Char] -> [Char]
<.> [Char]
".bed")
if GenotypeDataSpec -> GenotypeFormatSpec
format (PoseidonPackage -> GenotypeDataSpec
posPacGenotypeData PoseidonPackage
pac) GenotypeFormatSpec -> GenotypeFormatSpec -> Bool
forall a. Eq a => a -> a -> Bool
== GenotypeFormatSpec
outFormat
then [Char] -> PoseidonIO ()
logWarning [Char]
"The genotype data is already in the requested format"
else do
[Char]
newBaseDir <- case Maybe [Char]
outPath of
Just [Char]
x -> do
[Char] -> PoseidonIO ()
logInfo ([Char] -> PoseidonIO ()) -> [Char] -> PoseidonIO ()
forall a b. (a -> b) -> a -> b
$ [Char]
"Writing to directory (will be created if missing): " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++ [Char]
x
IO () -> PoseidonIO ()
forall a. IO a -> ReaderT Env IO a
forall (m :: * -> *) a. MonadIO m => IO a -> m a
liftIO (IO () -> PoseidonIO ()) -> IO () -> PoseidonIO ()
forall a b. (a -> b) -> a -> b
$ Bool -> [Char] -> IO ()
createDirectoryIfMissing Bool
True ([Char] -> [Char]
dropTrailingPathSeparator [Char]
x)
[Char] -> ReaderT Env IO [Char]
forall a. a -> ReaderT Env IO a
forall (m :: * -> *) a. Monad m => a -> m a
return [Char]
x
Maybe [Char]
Nothing -> [Char] -> ReaderT Env IO [Char]
forall a. a -> ReaderT Env IO a
forall (m :: * -> *) a. Monad m => a -> m a
return ([Char] -> ReaderT Env IO [Char])
-> [Char] -> ReaderT Env IO [Char]
forall a b. (a -> b) -> a -> b
$ PoseidonPackage -> [Char]
posPacBaseDir PoseidonPackage
pac
let ([Char]
outG, [Char]
outS, [Char]
outI) = ([Char]
newBaseDir [Char] -> [Char] -> [Char]
</> [Char]
outGeno, [Char]
newBaseDir [Char] -> [Char] -> [Char]
</> [Char]
outSnp, [Char]
newBaseDir [Char] -> [Char] -> [Char]
</> [Char]
outInd)
Bool
anyExists <- [Bool] -> Bool
forall (t :: * -> *). Foldable t => t Bool -> Bool
or ([Bool] -> Bool) -> ReaderT Env IO [Bool] -> ReaderT Env IO Bool
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> ([Char] -> ReaderT Env IO Bool)
-> [[Char]] -> ReaderT Env IO [Bool]
forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
(a -> m b) -> t a -> m (t b)
forall (m :: * -> *) a b. Monad m => (a -> m b) -> [a] -> m [b]
mapM [Char] -> ReaderT Env IO Bool
checkFile [[Char]
outG, [Char]
outS, [Char]
outI]
if Bool
anyExists
then [Char] -> PoseidonIO ()
logWarning ([Char]
"skipping genotype conversion for " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++ PacNameAndVersion -> [Char]
forall a. Show a => a -> [Char]
show (PoseidonPackage -> PacNameAndVersion
posPacNameAndVersion PoseidonPackage
pac))
else do
[Char] -> PoseidonIO ()
logInfo [Char]
"Processing SNPs..."
LogA
logA <- PoseidonIO LogA
envLogAction
UTCTime
currentTime <- IO UTCTime -> ReaderT Env IO UTCTime
forall a. IO a -> ReaderT Env IO a
forall (m :: * -> *) a. MonadIO m => IO a -> m a
liftIO IO UTCTime
getCurrentTime
ErrorLength
errLength <- PoseidonIO ErrorLength
envErrorLength
IO () -> PoseidonIO ()
forall a. IO a -> ReaderT Env IO a
forall (m :: * -> *) a. MonadIO m => IO a -> m a
liftIO (IO () -> PoseidonIO ()) -> IO () -> PoseidonIO ()
forall a b. (a -> b) -> a -> b
$ IO () -> (SomeException -> IO ()) -> IO ()
forall e a. Exception e => IO a -> (e -> IO a) -> IO a
catch (
SafeT IO () -> IO ()
forall (m :: * -> *) r.
(MonadMask m, MonadIO m) =>
SafeT m r -> m r
runSafeT (SafeT IO () -> IO ()) -> SafeT IO () -> IO ()
forall a b. (a -> b) -> a -> b
$ do
([EigenstratIndEntry]
eigenstratIndEntries, Producer (EigenstratSnpEntry, GenoLine) (SafeT IO) ()
eigenstratProd) <- [Char]
-> GenotypeDataSpec
-> PlinkPopNameMode
-> SafeT
IO
([EigenstratIndEntry],
Producer (EigenstratSnpEntry, GenoLine) (SafeT IO) ())
forall (m :: * -> *).
MonadSafe m =>
[Char]
-> GenotypeDataSpec
-> PlinkPopNameMode
-> m ([EigenstratIndEntry],
Producer (EigenstratSnpEntry, GenoLine) m ())
loadGenotypeData (PoseidonPackage -> [Char]
posPacBaseDir PoseidonPackage
pac) (PoseidonPackage -> GenotypeDataSpec
posPacGenotypeData PoseidonPackage
pac) PlinkPopNameMode
inPlinkPopMode
let outConsumer :: Consumer (EigenstratSnpEntry, GenoLine) (SafeT IO) ()
outConsumer = case GenotypeFormatSpec
outFormat of
GenotypeFormatSpec
GenotypeFormatEigenstrat -> [Char]
-> [Char]
-> [Char]
-> [EigenstratIndEntry]
-> Consumer (EigenstratSnpEntry, GenoLine) (SafeT IO) ()
forall (m :: * -> *).
MonadSafe m =>
[Char]
-> [Char]
-> [Char]
-> [EigenstratIndEntry]
-> Consumer (EigenstratSnpEntry, GenoLine) m ()
writeEigenstrat [Char]
outG [Char]
outS [Char]
outI [EigenstratIndEntry]
eigenstratIndEntries
GenotypeFormatSpec
GenotypeFormatPlink -> [Char]
-> [Char]
-> [Char]
-> [PlinkFamEntry]
-> Consumer (EigenstratSnpEntry, GenoLine) (SafeT IO) ()
forall (m :: * -> *).
MonadSafe m =>
[Char]
-> [Char]
-> [Char]
-> [PlinkFamEntry]
-> Consumer (EigenstratSnpEntry, GenoLine) m ()
writePlink [Char]
outG [Char]
outS [Char]
outI ((EigenstratIndEntry -> PlinkFamEntry)
-> [EigenstratIndEntry] -> [PlinkFamEntry]
forall a b. (a -> b) -> [a] -> [b]
map (PlinkPopNameMode -> EigenstratIndEntry -> PlinkFamEntry
eigenstratInd2PlinkFam PlinkPopNameMode
outPlinkPopMode) [EigenstratIndEntry]
eigenstratIndEntries)
Effect (SafeT IO) () -> SafeT IO ()
forall (m :: * -> *) r. Monad m => Effect m r -> m r
runEffect (Effect (SafeT IO) () -> SafeT IO ())
-> Effect (SafeT IO) () -> SafeT IO ()
forall a b. (a -> b) -> a -> b
$ Producer (EigenstratSnpEntry, GenoLine) (SafeT IO) ()
eigenstratProd Producer (EigenstratSnpEntry, GenoLine) (SafeT IO) ()
-> Proxy
()
(EigenstratSnpEntry, GenoLine)
()
(EigenstratSnpEntry, GenoLine)
(SafeT IO)
()
-> Producer (EigenstratSnpEntry, GenoLine) (SafeT IO) ()
forall (m :: * -> *) a' a b r c' c.
Functor m =>
Proxy a' a () b m r -> Proxy () b c' c m r -> Proxy a' a c' c m r
>-> LogA
-> UTCTime
-> Proxy
()
(EigenstratSnpEntry, GenoLine)
()
(EigenstratSnpEntry, GenoLine)
(SafeT IO)
()
forall (m :: * -> *) a.
MonadIO m =>
LogA -> UTCTime -> Pipe a a m ()
printSNPCopyProgress LogA
logA UTCTime
currentTime Producer (EigenstratSnpEntry, GenoLine) (SafeT IO) ()
-> Consumer (EigenstratSnpEntry, GenoLine) (SafeT IO) ()
-> Effect (SafeT IO) ()
forall (m :: * -> *) a' a b r c' c.
Functor m =>
Proxy a' a () b m r -> Proxy () b c' c m r -> Proxy a' a c' c m r
>-> Consumer (EigenstratSnpEntry, GenoLine) (SafeT IO) ()
outConsumer
) (PoseidonException -> IO ()
forall e a. Exception e => e -> IO a
throwIO (PoseidonException -> IO ())
-> (SomeException -> PoseidonException) -> SomeException -> IO ()
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ErrorLength -> SomeException -> PoseidonException
PoseidonGenotypeExceptionForward ErrorLength
errLength)
[Char] -> PoseidonIO ()
logInfo [Char]
"Done"
Bool -> PoseidonIO () -> PoseidonIO ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
unless (Bool
onlyGeno Bool -> Bool -> Bool
|| Maybe [Char] -> Bool
forall a. Maybe a -> Bool
isJust Maybe [Char]
outPath) (PoseidonIO () -> PoseidonIO ()) -> PoseidonIO () -> PoseidonIO ()
forall a b. (a -> b) -> a -> b
$ do
let genotypeData :: GenotypeDataSpec
genotypeData = GenotypeFormatSpec
-> [Char]
-> Maybe [Char]
-> [Char]
-> Maybe [Char]
-> [Char]
-> Maybe [Char]
-> Maybe SNPSetSpec
-> GenotypeDataSpec
GenotypeDataSpec GenotypeFormatSpec
outFormat [Char]
outGeno Maybe [Char]
forall a. Maybe a
Nothing [Char]
outSnp Maybe [Char]
forall a. Maybe a
Nothing [Char]
outInd Maybe [Char]
forall a. Maybe a
Nothing (GenotypeDataSpec -> Maybe SNPSetSpec
snpSet (GenotypeDataSpec -> Maybe SNPSetSpec)
-> (PoseidonPackage -> GenotypeDataSpec)
-> PoseidonPackage
-> Maybe SNPSetSpec
forall b c a. (b -> c) -> (a -> b) -> a -> c
. PoseidonPackage -> GenotypeDataSpec
posPacGenotypeData (PoseidonPackage -> Maybe SNPSetSpec)
-> PoseidonPackage -> Maybe SNPSetSpec
forall a b. (a -> b) -> a -> b
$ PoseidonPackage
pac)
newPac :: PoseidonPackage
newPac = PoseidonPackage
pac { posPacGenotypeData :: GenotypeDataSpec
posPacGenotypeData = GenotypeDataSpec
genotypeData }
[Char] -> PoseidonIO ()
logInfo [Char]
"Adjusting POSEIDON.yml..."
IO () -> PoseidonIO ()
forall a. IO a -> ReaderT Env IO a
forall (m :: * -> *) a. MonadIO m => IO a -> m a
liftIO (IO () -> PoseidonIO ()) -> IO () -> PoseidonIO ()
forall a b. (a -> b) -> a -> b
$ PoseidonPackage -> IO ()
writePoseidonPackage PoseidonPackage
newPac
Bool -> PoseidonIO () -> PoseidonIO ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when Bool
removeOld (PoseidonIO () -> PoseidonIO ()) -> PoseidonIO () -> PoseidonIO ()
forall a b. (a -> b) -> a -> b
$ IO () -> PoseidonIO ()
forall a. IO a -> ReaderT Env IO a
forall (m :: * -> *) a. MonadIO m => IO a -> m a
liftIO (IO () -> PoseidonIO ()) -> IO () -> PoseidonIO ()
forall a b. (a -> b) -> a -> b
$ ([Char] -> IO ()) -> [[Char]] -> IO ()
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
(a -> m b) -> t a -> m ()
mapM_ [Char] -> IO ()
removeFile [
PoseidonPackage -> [Char]
posPacBaseDir PoseidonPackage
pac [Char] -> [Char] -> [Char]
</> GenotypeDataSpec -> [Char]
genoFile (PoseidonPackage -> GenotypeDataSpec
posPacGenotypeData PoseidonPackage
pac)
, PoseidonPackage -> [Char]
posPacBaseDir PoseidonPackage
pac [Char] -> [Char] -> [Char]
</> GenotypeDataSpec -> [Char]
snpFile (PoseidonPackage -> GenotypeDataSpec
posPacGenotypeData PoseidonPackage
pac)
, PoseidonPackage -> [Char]
posPacBaseDir PoseidonPackage
pac [Char] -> [Char] -> [Char]
</> GenotypeDataSpec -> [Char]
indFile (PoseidonPackage -> GenotypeDataSpec
posPacGenotypeData PoseidonPackage
pac)
]
where
checkFile :: FilePath -> PoseidonIO Bool
checkFile :: [Char] -> ReaderT Env IO Bool
checkFile [Char]
fn = do
Bool
fe <- IO Bool -> ReaderT Env IO Bool
forall a. IO a -> ReaderT Env IO a
forall (m :: * -> *) a. MonadIO m => IO a -> m a
liftIO (IO Bool -> ReaderT Env IO Bool) -> IO Bool -> ReaderT Env IO Bool
forall a b. (a -> b) -> a -> b
$ [Char] -> IO Bool
doesFileExist [Char]
fn
Bool -> PoseidonIO () -> PoseidonIO ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when Bool
fe (PoseidonIO () -> PoseidonIO ()) -> PoseidonIO () -> PoseidonIO ()
forall a b. (a -> b) -> a -> b
$ [Char] -> PoseidonIO ()
logWarning ([Char] -> PoseidonIO ()) -> [Char] -> PoseidonIO ()
forall a b. (a -> b) -> a -> b
$ [Char]
"File " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++ [Char]
fn [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++ [Char]
" exists"
Bool -> ReaderT Env IO Bool
forall a. a -> ReaderT Env IO a
forall (m :: * -> *) a. Monad m => a -> m a
return Bool
fe