Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bibliography API and listing functionality #320

Merged
merged 13 commits into from
Jan 10, 2025
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
- V 1.5.9.0:
- Added a feature to list bibliography information via `trident list --bibliography`.
- Added a new Server API `/bibliography` to serve bibliography information via HTTP.
- V 1.5.7.4:
- Fixed a bug that broke the long-form genotype data input option (with `--genoFile + --snpFile + ...`).
- V 1.5.7.3:
Expand Down
2 changes: 1 addition & 1 deletion poseidon-hs.cabal
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
name: poseidon-hs
version: 1.5.7.4
version: 1.5.9.0
synopsis: A package with tools for working with Poseidon genotype data
description: The tools in this package read and analyse Poseidon-formatted genotype databases, a modular system for storing genotype data from thousands of individuals.
license: MIT
Expand Down
75 changes: 66 additions & 9 deletions src/Poseidon/CLI/List.hs
Original file line number Diff line number Diff line change
Expand Up @@ -5,22 +5,23 @@
import Poseidon.EntityTypes (HasNameAndVersion (..))
import Poseidon.Package (PackageReadOptions (..),
defaultPackageReadOptions,
getAllGroupInfo,
getAllGroupInfo, getBibliographyInfo,
getExtendedIndividualInfo,
packagesToPackageInfos,
readPoseidonPackageCollection)
import Poseidon.ServerClient (AddJannoColSpec (..),
ApiReturnData (..),
import Poseidon.ServerClient (AddColSpec (..), ApiReturnData (..),
ArchiveEndpoint (..),
BibliographyInfo (..),
ExtendedIndividualInfo (..),
GroupInfo (..), PackageInfo (..),
processApiResponse, qDefault)
import Poseidon.Utils (PoseidonIO, logInfo, logWarning)

import Control.Monad (forM_, when)
import Control.Monad.IO.Class (liftIO)
import Data.List (intercalate, sortOn)
import Data.List (intercalate, nub, sortOn)
import Data.Maybe (catMaybes, fromMaybe)
import qualified Data.Text as T
import Data.Version (Version, showVersion)

import Text.Layout.Table (asciiRoundS, column, def, expandUntil,
Expand All @@ -39,7 +40,8 @@
-- | A datatype to represent the options what to list
data ListEntity = ListPackages
| ListGroups
| ListIndividuals AddJannoColSpec
| ListIndividuals AddColSpec
| ListBibliography AddColSpec

-- | The main function running the list command
runList :: ListOptions -> PoseidonIO ()
Expand Down Expand Up @@ -95,9 +97,9 @@
RepoRemote (ArchiveEndpoint remoteURL archive) -> do
logInfo "Downloading individual data from server"
let addJannoColFlag = case addJannoColSpec of
AddJannoColAll -> "&additionalJannoColumns=ALL"
AddJannoColList [] -> ""
AddJannoColList moreJannoColumns -> "&additionalJannoColumns=" ++ intercalate "," moreJannoColumns
AddColAll -> "&additionalJannoColumns=ALL"
AddColList [] -> ""

Check warning on line 101 in src/Poseidon/CLI/List.hs

View check run for this annotation

Codecov / codecov/patch

src/Poseidon/CLI/List.hs#L101

Added line #L101 was not covered by tests
AddColList moreJannoColumns -> "&additionalJannoColumns=" ++ intercalate "," moreJannoColumns
apiReturn <- processApiResponse (remoteURL ++ "/individuals" ++ qDefault archive ++ addJannoColFlag) False
case apiReturn of
ApiReturnExtIndividualInfo indInfo -> return indInfo
Expand All @@ -114,7 +116,7 @@
-- we only output this warning if the columns were requested explicitly. Not if
-- all columns were requested. We consider such an "all" request to mean "all columns that are present".
case addJannoColSpec of
AddJannoColList (_:_) -> do
AddColList (_:_) -> do
forM_ (zip [0..] addJannoCols) $ \(i, columnKey) -> do
-- check entries in all individuals for that key
let nonEmptyEntries = catMaybes [snd (entries !! i) | ExtendedIndividualInfo _ _ _ _ entries <- extIndInfos]
Expand All @@ -129,6 +131,56 @@
showMaybeVersion (getPacVersion i), show isLatest] ++
map (fromMaybe "n/a" . snd) addColumnEntries
return (tableH, tableB)
ListBibliography addColSpec -> do
bibInfos <- case repoLocation of
RepoRemote (ArchiveEndpoint remoteURL archive) -> do
logInfo "Downloading bibliography data from server"
let addJannoColFlag = case addColSpec of
AddColAll -> "&additionalBibColumns=ALL"
AddColList [] -> ""
AddColList moreBibFields -> "&additionalBibColumns=" ++ intercalate "," moreBibFields
apiReturn <- processApiResponse (remoteURL ++ "/bibliography" ++ qDefault archive ++ addJannoColFlag) False
case apiReturn of
ApiReturnBibInfo bibInfo -> return bibInfo
_ -> error "should not happen"

Check warning on line 145 in src/Poseidon/CLI/List.hs

View check run for this annotation

Codecov / codecov/patch

src/Poseidon/CLI/List.hs#L136-L145

Added lines #L136 - L145 were not covered by tests
RepoLocal baseDirs -> do
pacCollection <- readPoseidonPackageCollection pacReadOpts baseDirs
getBibliographyInfo pacCollection addColSpec

let addBibFieldNames = case addColSpec of
AddColAll -> nub . concatMap (map fst . bibInfoAddCols) $ bibInfos
AddColList names -> names

Check warning on line 152 in src/Poseidon/CLI/List.hs

View check run for this annotation

Codecov / codecov/patch

src/Poseidon/CLI/List.hs#L152

Added line #L152 was not covered by tests

-- warning in case the additional Columns do not exist in the entire janno dataset,
-- we only output this warning if the columns were requested explicitly. Not if
-- all columns were requested. We consider such an "all" request to mean "all columns that are present".
case addColSpec of
AddColList (_:_) -> do
forM_ addBibFieldNames $ \bibFieldKey -> do

Check warning on line 159 in src/Poseidon/CLI/List.hs

View check run for this annotation

Codecov / codecov/patch

src/Poseidon/CLI/List.hs#L158-L159

Added lines #L158 - L159 were not covered by tests
-- check entries in all individuals for that key
let nonEmptyEntries = do
bibInfo <- bibInfos
Just (Just _) <- return $ bibFieldKey `lookup` bibInfoAddCols bibInfo
return ()
when (null nonEmptyEntries) . logWarning $
"Bibliography field " ++ bibFieldKey ++ "is not present in any bibliography entry"

Check warning on line 166 in src/Poseidon/CLI/List.hs

View check run for this annotation

Codecov / codecov/patch

src/Poseidon/CLI/List.hs#L161-L166

Added lines #L161 - L166 were not covered by tests
_ -> return ()

let tableH = ["BibKey", "Title", "Author", "Year", "Journal", "Doi",
"Nr of samples"] ++ addBibFieldNames
tableB = do
bibInfo <- bibInfos
let addBibFieldColumns = do
bibFieldName <- addBibFieldNames
case bibFieldName `lookup` bibInfoAddCols bibInfo of
Just (Just v) -> return v
_ -> return ""

Check warning on line 177 in src/Poseidon/CLI/List.hs

View check run for this annotation

Codecov / codecov/patch

src/Poseidon/CLI/List.hs#L173-L177

Added lines #L173 - L177 were not covered by tests
return $ [bibInfoKey bibInfo, curateBibField $ bibInfoTitle bibInfo, curateBibField $ bibInfoAuthor bibInfo,
curateBibField $ bibInfoYear bibInfo, curateBibField $ bibInfoJournal bibInfo,
curateBibField $ bibInfoDoi bibInfo, show (bibInfoNrSamples bibInfo)] ++
addBibFieldColumns

Check warning on line 181 in src/Poseidon/CLI/List.hs

View check run for this annotation

Codecov / codecov/patch

src/Poseidon/CLI/List.hs#L181

Added line #L181 was not covered by tests
return (tableH, tableB)

if rawOutput then
liftIO $ putStrLn $ intercalate "\n" [intercalate "\t" row | row <- tableH:tableB]
else do
Expand All @@ -139,3 +191,8 @@
showMaybe = fromMaybe "n/a"
showMaybeVersion :: Maybe Version -> String
showMaybeVersion = maybe "n/a" showVersion

-- this function is necessary, as BibTeX sometimes has arbitrary line breaks within fields,
-- which we need to get rid of to avoid down-stream problems
curateBibField :: Maybe String -> String
curateBibField = T.unpack . T.intercalate " " . map T.strip . T.lines . T.pack . fromMaybe ""
22 changes: 17 additions & 5 deletions src/Poseidon/CLI/OptparseApplicativeParsers.hs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
GenotypeDataSpec (..),
GenotypeFileSpec (..),
SNPSetSpec (..))
import Poseidon.ServerClient (AddJannoColSpec (..),
import Poseidon.ServerClient (AddColSpec (..),
ArchiveEndpoint (..))
import Poseidon.Utils (ErrorLength (..), LogMode (..),
TestMode (..),
Expand Down Expand Up @@ -636,7 +636,10 @@
OP.value Nothing)

parseListEntity :: OP.Parser ListEntity
parseListEntity = parseListPackages <|> parseListGroups <|> (parseListIndividualsDummy *> parseListIndividualsExtraCols)
parseListEntity = parseListPackages <|>
parseListGroups <|>
(parseListIndividualsDummy *> parseListIndividualsExtraCols) <|>
(parseListBibliographyDummy *> parseListBibliographyExtraFields)

Check warning on line 642 in src/Poseidon/CLI/OptparseApplicativeParsers.hs

View check run for this annotation

Codecov / codecov/patch

src/Poseidon/CLI/OptparseApplicativeParsers.hs#L639-L642

Added lines #L639 - L642 were not covered by tests
where
parseListPackages = OP.flag' ListPackages (
OP.long "packages" <>
Expand All @@ -648,14 +651,23 @@
parseListIndividualsDummy = OP.flag' () (
OP.long "individuals" <>
OP.help "List all individuals/samples.")
parseListIndividualsExtraCols = ListIndividuals <$> (parseAllJannoCols <|> (AddJannoColList <$> OP.many parseExtraCol))
parseAllJannoCols = OP.flag' AddJannoColAll (OP.long "fullJanno" <> OP.help "output all Janno Columns")
parseListIndividualsExtraCols = ListIndividuals <$> (parseAllJannoCols <|> (AddColList <$> OP.many parseExtraCol))
parseAllJannoCols = OP.flag' AddColAll (OP.long "fullJanno" <> OP.help "output all Janno Columns")

Check warning on line 655 in src/Poseidon/CLI/OptparseApplicativeParsers.hs

View check run for this annotation

Codecov / codecov/patch

src/Poseidon/CLI/OptparseApplicativeParsers.hs#L654-L655

Added lines #L654 - L655 were not covered by tests
parseExtraCol = OP.strOption (
OP.short 'j' <>
OP.long "jannoColumn" <>
OP.metavar "COLNAME" <>
OP.help "List additional fields from the janno files, using the .janno column heading name, such as \
\\"Country\", \"Site\", \"Date_C14_Uncal_BP\", etc..")
\\"Country\", \"Site\", \"Date_C14_Uncal_BP\", etc... Can be given multiple times")
parseListBibliographyDummy = OP.flag' () (
OP.long "bibliography" <> OP.help "output bibliography information for packages")
parseListBibliographyExtraFields = ListBibliography <$>
(parseAllBibFields <|> (AddColList <$> OP.many parseExtraBibFields))
parseAllBibFields = OP.flag' AddColAll (OP.long "fullBib" <> OP.help "output all bibliography fields found in any bibliography item")
parseExtraBibFields = OP.strOption (
OP.short 'b' <> OP.long "bibField" <> OP.metavar "BIB-FIELD" <>
OP.help "List information from the given bibliography field, for example \"abstract\" or \"publisher\". Can \

Check warning on line 669 in src/Poseidon/CLI/OptparseApplicativeParsers.hs

View check run for this annotation

Codecov / codecov/patch

src/Poseidon/CLI/OptparseApplicativeParsers.hs#L662-L669

Added lines #L662 - L669 were not covered by tests
\be given multiple times.")

parseRawOutput :: OP.Parser Bool
parseRawOutput = OP.switch (
Expand Down
22 changes: 18 additions & 4 deletions src/Poseidon/CLI/Serve.hs
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,12 @@
PoseidonPackage (..),
defaultPackageReadOptions,
getAllGroupInfo,
getBibliographyInfo,
getExtendedIndividualInfo,
packagesToPackageInfos,
readPoseidonPackageCollection)
import Poseidon.PoseidonVersion (minimalRequiredClientVersion)
import Poseidon.ServerClient (AddJannoColSpec (..),
import Poseidon.ServerClient (AddColSpec (..),
ApiReturnData (..),
ServerApiReturnType (..))
import Poseidon.Utils (LogA, PoseidonIO, envLogAction,
Expand Down Expand Up @@ -123,14 +124,27 @@
pacs <- getItemFromArchiveStore archiveStore
maybeAdditionalColumnsString <- (Just <$> param "additionalJannoColumns") `rescue` (\_ -> return Nothing)
indInfo <- case maybeAdditionalColumnsString of
Just "ALL" -> getExtendedIndividualInfo pacs AddJannoColAll -- Nothing means all Janno Columns
Just "ALL" -> getExtendedIndividualInfo pacs AddColAll -- Nothing means all Janno Columns
Just additionalColumnsString ->
let additionalColumnNames = splitOn "," additionalColumnsString
in getExtendedIndividualInfo pacs (AddJannoColList additionalColumnNames)
Nothing -> getExtendedIndividualInfo pacs (AddJannoColList [])
in getExtendedIndividualInfo pacs (AddColList additionalColumnNames)
Nothing -> getExtendedIndividualInfo pacs (AddColList [])
let retData = ApiReturnExtIndividualInfo indInfo
return $ ServerApiReturnType [] (Just retData)

get "/bibliography" . conditionOnClientVersion $ do
logRequest logA
pacs <- getItemFromArchiveStore archiveStore
maybeAdditionalBibFieldsString <- (Just <$> param "additionalBibColumns") `rescue` (\_ -> return Nothing)
bibInfo <- case maybeAdditionalBibFieldsString of
Just "ALL" -> getBibliographyInfo pacs AddColAll -- Nothing means all Janno Columns

Check warning on line 140 in src/Poseidon/CLI/Serve.hs

View check run for this annotation

Codecov / codecov/patch

src/Poseidon/CLI/Serve.hs#L136-L140

Added lines #L136 - L140 were not covered by tests
Just additionalBibFieldsString ->
let additionalBibFields = splitOn "," additionalBibFieldsString
in getBibliographyInfo pacs (AddColList additionalBibFields)
Nothing -> getBibliographyInfo pacs (AddColList [])
let retData = ApiReturnBibInfo bibInfo
return $ ServerApiReturnType [] (Just retData)

Check warning on line 146 in src/Poseidon/CLI/Serve.hs

View check run for this annotation

Codecov / codecov/patch

src/Poseidon/CLI/Serve.hs#L142-L146

Added lines #L142 - L146 were not covered by tests

-- API for retreiving package zip files
when (isJust maybeZipPath) . get "/zip_file/:package_name" $ do
logRequest logA
Expand Down
39 changes: 34 additions & 5 deletions src/Poseidon/Package.hs
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,13 @@
packagesToPackageInfos,
getAllGroupInfo,
validateGeno,
filterToRelevantPackages
filterToRelevantPackages,
getBibliographyInfo
) where

import Poseidon.BibFile (BibEntry (..), BibTeX,
readBibTeXFile)
import Poseidon.ColumnTypes (JannoPublication (..))
import Poseidon.Contributor (ContributorSpec (..))
import Poseidon.EntityTypes (EntitySpec, HasNameAndVersion (..),
IndividualInfo (..),
Expand Down Expand Up @@ -59,7 +61,8 @@
SeqSourceRow (..),
SeqSourceRows (..),
readSeqSourceFile)
import Poseidon.ServerClient (AddJannoColSpec (..),
import Poseidon.ServerClient (AddColSpec (..),
BibliographyInfo (..),
ExtendedIndividualInfo (..),
GroupInfo (..), PackageInfo (..))
import Poseidon.Utils (LogA, PoseidonException (..),
Expand Down Expand Up @@ -90,6 +93,7 @@
(\\))
import Data.Maybe (catMaybes, fromMaybe, isNothing,
mapMaybe)
import Data.Text (unpack)
import Data.Time (Day, UTCTime (..), getCurrentTime)
import qualified Data.Vector as V
import Data.Version (Version (..), makeVersion)
Expand Down Expand Up @@ -838,16 +842,16 @@
return (map fst . concat $ indInfoLatestPairs, map snd . concat $ indInfoLatestPairs)


getExtendedIndividualInfo :: (MonadThrow m) => [PoseidonPackage] -> AddJannoColSpec -> m [ExtendedIndividualInfo]
getExtendedIndividualInfo :: (MonadThrow m) => [PoseidonPackage] -> AddColSpec -> m [ExtendedIndividualInfo]
getExtendedIndividualInfo allPackages addJannoColSpec = sequence $ do -- list monad
pac <- allPackages -- outer loop (automatically concatenating over inner loops)
jannoRow <- getJannoRowsFromPac pac -- inner loop
let name = jPoseidonID jannoRow
groups = map show $ getListColumn . jGroupName $ jannoRow
colNames = case addJannoColSpec of
AddJannoColAll -> jannoHeaderString \\ ["Poseidon_ID", "Group_Name"] -- Nothing means all Janno columns
AddColAll -> jannoHeaderString \\ ["Poseidon_ID", "Group_Name"] -- Nothing means all Janno columns
-- except for these two which are already explicit
AddJannoColList c -> c
AddColList c -> c
additionalColumnEntries = [(k, BSC.unpack <$> toNamedRecord jannoRow HM.!? BSC.pack k) | k <- colNames]
isLatest <- isLatestInCollection allPackages pac -- this lives in monad m
-- double-return for m and then list.
Expand All @@ -859,3 +863,28 @@
indInfoCollection <- getJointIndividualInfo packages
relevantPacs <- determineRelevantPackages entities indInfoCollection
return $ filter (\p -> makePacNameAndVersion p `elem` relevantPacs) packages

getBibliographyInfo :: (MonadThrow m) => [PoseidonPackage] -> AddColSpec -> m [BibliographyInfo]
getBibliographyInfo allPackages addColSpec = do
allLatestPacs <- filterM (isLatestInCollection allPackages) allPackages
-- we use only latest packages for this particular feature, otherwise interpretability gets weird.
let allBibEntries = nub . sortOn bibEntryId . concatMap posPacBib $ allLatestPacs
let jointJanno = getJannoRows $ getJointJanno allLatestPacs
forM allBibEntries $ \(BibEntry _ bibId bibFields) -> do
let nrSamples = length $ do -- list monad over samples
jannoRow <- jointJanno
let bibKeys = case jPublication jannoRow of
Nothing -> []

Check warning on line 877 in src/Poseidon/Package.hs

View check run for this annotation

Codecov / codecov/patch

src/Poseidon/Package.hs#L877

Added line #L877 was not covered by tests
Just jannoPubList -> map (\(JannoPublication p) -> unpack p) $ getListColumn jannoPubList
True <- return $ bibId `elem` bibKeys
return ()
let addBibEntries = case addColSpec of
-- with "all" we include all existing additional bib-entries except for the canonical ones that we anyway look up.
AddColAll -> [(k, Just v) | (k, v) <- bibFields, k `notElem` ["title", "author", "year", "journal", "doi"]]
-- with a selecton of colNames we just query the bib-fields for those exact fields.
AddColList colNames -> [(colName, colName `lookup` bibFields) | colName <- colNames]

Check warning on line 885 in src/Poseidon/Package.hs

View check run for this annotation

Codecov / codecov/patch

src/Poseidon/Package.hs#L885

Added line #L885 was not covered by tests
return $ BibliographyInfo nrSamples bibId ("title" `lookup` bibFields)
("author" `lookup` bibFields) ("year" `lookup` bibFields) ("journal" `lookup` bibFields)
("doi" `lookup` bibFields) addBibEntries


Loading
Loading