From 6a1d3c2cfd3ee77eaa6bdb6d660a5c8717d441ce Mon Sep 17 00:00:00 2001 From: Adam Gundry Date: Tue, 17 Aug 2021 21:26:42 +0100 Subject: [PATCH] WIP: migrate search engine to full-text-search package (#748) --- Distribution/Server/Features/Search.hs | 18 +- Distribution/Server/Features/Search/BM25F.hs | 191 -------- .../Server/Features/Search/DocFeatVals.hs | 26 -- .../Server/Features/Search/DocIdSet.hs | 201 --------- .../Server/Features/Search/DocTermIds.hs | 64 --- .../Server/Features/Search/PkgSearch.hs | 17 +- .../Server/Features/Search/SearchEngine.hs | 405 ----------------- .../Server/Features/Search/SearchIndex.hs | 414 ------------------ .../Server/Features/Search/TermBag.hs | 76 ---- Distribution/Server/Framework/MemSize.hs | 7 + hackage-server.cabal | 8 +- 11 files changed, 27 insertions(+), 1400 deletions(-) delete mode 100644 Distribution/Server/Features/Search/BM25F.hs delete mode 100644 Distribution/Server/Features/Search/DocFeatVals.hs delete mode 100644 Distribution/Server/Features/Search/DocIdSet.hs delete mode 100644 Distribution/Server/Features/Search/DocTermIds.hs delete mode 100644 Distribution/Server/Features/Search/SearchEngine.hs delete mode 100644 Distribution/Server/Features/Search/SearchIndex.hs delete mode 100644 Distribution/Server/Features/Search/TermBag.hs diff --git a/Distribution/Server/Features/Search.hs b/Distribution/Server/Features/Search.hs index f5f22987c..43b4fb406 100644 --- a/Distribution/Server/Features/Search.hs +++ b/Distribution/Server/Features/Search.hs @@ -7,7 +7,7 @@ module Distribution.Server.Features.Search ( defaultSearchRankParameters, SearchEngine.SearchRankParameters(..), PkgDocField, PkgDocFeatures, - BM25F.Explanation(..), + SearchEngine.Explanation(..), ) where import Distribution.Server.Framework @@ -17,9 +17,7 @@ import Distribution.Server.Features.Core import Distribution.Server.Features.PackageList import Distribution.Server.Features.Search.PkgSearch -import Distribution.Server.Features.Search.SearchEngine (SearchRankParameters(..)) -import qualified Distribution.Server.Features.Search.SearchEngine as SearchEngine -import qualified Distribution.Server.Features.Search.BM25F as BM25F +import qualified Data.SearchEngine as SearchEngine import qualified Distribution.Server.Packages.PackageIndex as PackageIndex import Distribution.Server.Packages.Types @@ -40,10 +38,10 @@ data SearchFeature = SearchFeature { searchPackages :: forall m. MonadIO m => [String] -> m [PackageName], searchPackagesExplain :: forall m. MonadIO m - => SearchRankParameters PkgDocField PkgDocFeatures + => PkgSearchRankParameters -> [String] -> m (Maybe PackageName, - [(BM25F.Explanation PkgDocField PkgDocFeatures T.Text + [(SearchEngine.Explanation PkgDocField PkgDocFeatures T.Text ,PackageName)]) } @@ -62,7 +60,7 @@ initSearchFeature env@ServerEnv{serverTemplatesDir, serverTemplatesMode} = do searchEngineState <- newMemStateWHNF initialPkgSearchEngine - return $ \core@CoreFeature{..} list -> do + return $ \core list -> do let feature = searchFeature env core list searchEngineState templates @@ -157,15 +155,15 @@ searchFeature ServerEnv{serverBaseURI} CoreFeature{..} ListFeature{getAllLists} return results searchPackagesExplain :: MonadIO m - => SearchRankParameters PkgDocField PkgDocFeatures + => PkgSearchRankParameters -> [String] - -> m (Maybe PackageName, [(BM25F.Explanation PkgDocField PkgDocFeatures T.Text, PackageName)]) + -> m (Maybe PackageName, [(SearchEngine.Explanation PkgDocField PkgDocFeatures T.Text, PackageName)]) searchPackagesExplain params terms = do se <- readMemState searchEngineState let results = SearchEngine.queryExplain (SearchEngine.setRankParams params se) (map T.pack terms) - return results + return (Nothing, results) -- TODO: no exact match available from the full-text-search version of queryExplain handlerGetOpenSearch :: DynamicPath -> ServerPartE Response handlerGetOpenSearch _ = do diff --git a/Distribution/Server/Features/Search/BM25F.hs b/Distribution/Server/Features/Search/BM25F.hs deleted file mode 100644 index 1b4189dbf..000000000 --- a/Distribution/Server/Features/Search/BM25F.hs +++ /dev/null @@ -1,191 +0,0 @@ -{-# LANGUAGE RecordWildCards #-} - --- | See: --- --- * \"The Probabilistic Relevance Framework: BM25 and Beyond\" --- --- --- * \"An Introduction to Information Retrieval\" --- --- -module Distribution.Server.Features.Search.BM25F ( - Context(..), - FeatureFunction(..), - Doc(..), - score, - - Explanation(..), - explain, - ) where - -import Data.Ix - -data Context term field feature = Context { - numDocsTotal :: !Int, - avgFieldLength :: field -> Float, - numDocsWithTerm :: term -> Int, - paramK1 :: !Float, - paramB :: field -> Float, - -- consider minimum length to prevent massive B bonus? - fieldWeight :: field -> Float, - featureWeight :: feature -> Float, - featureFunction :: feature -> FeatureFunction - } - -data Doc term field feature = Doc { - docFieldLength :: field -> Int, - docFieldTermFrequency :: field -> term -> Int, - docFeatureValue :: feature -> Float - } - - --- | The BM25F score for a document for a given set of terms. --- -score :: (Ix field, Bounded field, Ix feature, Bounded feature) => - Context term field feature -> - Doc term field feature -> [term] -> Float -score ctx doc terms = - sum (map (weightedTermScore ctx doc) terms) - + sum (map (weightedNonTermScore ctx doc) features) - - where - features = range (minBound, maxBound) - - -weightedTermScore :: (Ix field, Bounded field) => - Context term field feature -> - Doc term field feature -> term -> Float -weightedTermScore ctx doc t = - weightIDF ctx t * tf' - / (k1 + tf') - where - tf' = weightedDocTermFrequency ctx doc t - k1 = paramK1 ctx - - -weightIDF :: Context term field feature -> term -> Float -weightIDF ctx t = - log ((n - n_t + 0.5) / (n_t + 0.5)) - where - n = fromIntegral (numDocsTotal ctx) - n_t = fromIntegral (numDocsWithTerm ctx t) - - -weightedDocTermFrequency :: (Ix field, Bounded field) => - Context term field feature -> - Doc term field feature -> term -> Float -weightedDocTermFrequency ctx doc t = - sum [ w_f * tf_f / _B_f - | field <- range (minBound, maxBound) - , let w_f = fieldWeight ctx field - tf_f = fromIntegral (docFieldTermFrequency doc field t) - _B_f = lengthNorm ctx doc field - ] - - -lengthNorm :: Context term field feature -> - Doc term field feature -> field -> Float -lengthNorm ctx doc field = - (1-b_f) + b_f * sl_f / avgsl_f - where - b_f = paramB ctx field - sl_f = fromIntegral (docFieldLength doc field) - avgsl_f = avgFieldLength ctx field - - -weightedNonTermScore :: (Ix feature, Bounded feature) => - Context term field feature -> - Doc term field feature -> feature -> Float -weightedNonTermScore ctx doc feature = - w_f * _V_f f_f - where - w_f = featureWeight ctx feature - _V_f = applyFeatureFunction (featureFunction ctx feature) - f_f = docFeatureValue doc feature - - -data FeatureFunction - = LogarithmicFunction Float -- ^ @log (\lambda_i + f_i)@ - | RationalFunction Float -- ^ @f_i / (\lambda_i + f_i)@ - | SigmoidFunction Float Float -- ^ @1 / (\lambda + exp(-(\lambda' * f_i))@ - -applyFeatureFunction :: FeatureFunction -> (Float -> Float) -applyFeatureFunction (LogarithmicFunction p1) = \fi -> log (p1 + fi) -applyFeatureFunction (RationalFunction p1) = \fi -> fi / (p1 + fi) -applyFeatureFunction (SigmoidFunction p1 p2) = \fi -> 1 / (p1 + exp (-fi * p2)) - - ------------------- --- Explanation --- - --- | A breakdown of the BM25F score, to explain somewhat how it relates to --- the inputs, and so you can compare the scores of different documents. --- -data Explanation field feature term = Explanation { - -- | The overall score is the sum of the 'termScores', 'positionScore' - -- and 'nonTermScore' - overallScore :: Float, - - -- | There is a score contribution from each query term. This is the - -- score for the term across all fields in the document (but see - -- 'termFieldScores'). - termScores :: [(term, Float)], -{- - -- | There is a score contribution for positional information. Terms - -- appearing in the document close together give a bonus. - positionScore :: [(field, Float)], --} - -- | The document can have an inate bonus score independent of the terms - -- in the query. For example this might be a popularity score. - nonTermScores :: [(feature, Float)], - - -- | This does /not/ contribute to the 'overallScore'. It is an - -- indication of how the 'termScores' relates to per-field scores. - -- Note however that the term score for all fields is /not/ simply - -- sum of the per-field scores. The point of the BM25F scoring function - -- is that a linear combination of per-field scores is wrong, and BM25F - -- does a more cunning non-linear combination. - -- - -- However, it is still useful as an indication to see scores for each - -- field for a term, to see how the compare. - -- - termFieldScores :: [(term, [(field, Float)])] - } - deriving Show - -instance Functor (Explanation field feature) where - fmap f e@Explanation{..} = - e { - termScores = [ (f t, s) | (t, s) <- termScores ], - termFieldScores = [ (f t, fs) | (t, fs) <- termFieldScores ] - } - -explain :: (Ix field, Bounded field, Ix feature, Bounded feature) => - Context term field feature -> - Doc term field feature -> [term] -> Explanation field feature term -explain ctx doc ts = - Explanation {..} - where - overallScore = sum (map snd termScores) --- + sum (map snd positionScore) - + sum (map snd nonTermScores) - termScores = [ (t, weightedTermScore ctx doc t) | t <- ts ] --- positionScore = [ (f, 0) | f <- range (minBound, maxBound) ] - nonTermScores = [ (feature, weightedNonTermScore ctx doc feature) - | feature <- range (minBound, maxBound) ] - - termFieldScores = - [ (t, fieldScores) - | t <- ts - , let fieldScores = - [ (f, weightedTermScore ctx' doc t) - | f <- range (minBound, maxBound) - , let ctx' = ctx { fieldWeight = fieldWeightOnly f } - ] - ] - fieldWeightOnly f f' | sameField f f' = fieldWeight ctx f' - | otherwise = 0 - - sameField f f' = index (minBound, maxBound) f - == index (minBound, maxBound) f' diff --git a/Distribution/Server/Features/Search/DocFeatVals.hs b/Distribution/Server/Features/Search/DocFeatVals.hs deleted file mode 100644 index 096364276..000000000 --- a/Distribution/Server/Features/Search/DocFeatVals.hs +++ /dev/null @@ -1,26 +0,0 @@ -{-# LANGUAGE BangPatterns, GeneralizedNewtypeDeriving #-} -module Distribution.Server.Features.Search.DocFeatVals ( - DocFeatVals, - featureValue, - create, - ) where - -import Distribution.Server.Features.Search.DocTermIds (vecIndexIx, vecCreateIx) -import Distribution.Server.Framework.MemSize -import Data.Vector (Vector) -import Data.Ix (Ix) - - --- | Storage for the non-term feature values i a document. --- -newtype DocFeatVals feature = DocFeatVals (Vector Float) - deriving (Show, MemSize) - -featureValue :: (Ix feature, Bounded feature) => DocFeatVals feature -> feature -> Float -featureValue (DocFeatVals featVec) = vecIndexIx featVec - -create :: (Ix feature, Bounded feature) => - (feature -> Float) -> DocFeatVals feature -create docFeatVals = - DocFeatVals (vecCreateIx docFeatVals) - diff --git a/Distribution/Server/Features/Search/DocIdSet.hs b/Distribution/Server/Features/Search/DocIdSet.hs deleted file mode 100644 index f2af19357..000000000 --- a/Distribution/Server/Features/Search/DocIdSet.hs +++ /dev/null @@ -1,201 +0,0 @@ -{-# LANGUAGE BangPatterns, GeneralizedNewtypeDeriving, TypeFamilies, MultiParamTypeClasses #-} - -module Distribution.Server.Features.Search.DocIdSet ( - DocId, - DocIdSet, - null, - size, - empty, - singleton, - fromList, - toList, - toSet, - insert, - delete, - union, - invariant, - ) where - -import Distribution.Server.Framework.MemSize - -import Data.Word -import qualified Data.Vector.Unboxed as VU -import qualified Data.Vector.Unboxed.Mutable as MVU -import qualified Data.Vector.Generic.Base as VG -import qualified Data.Vector.Generic.Mutable as MVG -import Control.Monad (liftM) -import Control.Monad.ST -import Data.Set (Set) -import qualified Data.Set as Set - -import Prelude hiding (null) - ---import Test.QuickCheck ---import qualified Data.List as List - -newtype DocId = DocId Word32 - deriving (Eq, Ord, Show, Enum, Bounded, MemSize, VU.Unbox) - -newtype DocIdSet = DocIdSet (VU.Vector DocId) - deriving (Eq, Show) - - -newtype instance MVU.MVector s DocId = MVU_DocId (MVU.MVector s Word32) -newtype instance VU.Vector DocId = VU_DocId (VU.Vector Word32) - -instance VG.Vector VU.Vector DocId where --- fixme, can i do a noop/coerce/unsafecoerce instead of these fmaps on newtypes? --- or does it even matter? - basicUnsafeFreeze (MVU_DocId mvw)= VU_DocId `liftM` VG.basicUnsafeFreeze mvw - basicUnsafeThaw (VU_DocId vw) = MVU_DocId `liftM` VG.basicUnsafeThaw vw - basicLength (VU_DocId vw) = VG.basicLength vw - basicUnsafeSlice start end (VU_DocId vw) = VU_DocId $ VG.basicUnsafeSlice start end vw - basicUnsafeIndexM (VU_DocId vw) ix = DocId `liftM` VG.basicUnsafeIndexM vw ix - - -- basicLength, basicUnsafeSlice, basicOverlaps, basicUnsafeNew, basicUnsafeRead, basicUnsafeWrite -instance MVG.MVector MVU.MVector DocId where - basicLength (MVU_DocId vw) = MVG.basicLength vw - basicUnsafeSlice start end (MVU_DocId vw) = MVU_DocId $ MVG.basicUnsafeSlice start end vw - basicOverlaps (MVU_DocId vw1) (MVU_DocId vw2) = MVG.basicOverlaps vw1 vw2 - basicUnsafeNew sz = MVU_DocId `liftM` MVG.basicUnsafeNew sz - basicUnsafeRead (MVU_DocId vw) ix = DocId `liftM` MVG.basicUnsafeRead vw ix - basicUnsafeWrite (MVU_DocId vw) ix (DocId word) = MVG.basicUnsafeWrite vw ix word - basicInitialize (MVU_DocId vw) = MVG.basicInitialize vw - --- represented as a sorted sequence of ids -invariant :: DocIdSet -> Bool -invariant (DocIdSet vec) = - strictlyAscending (VU.toList vec) - where - strictlyAscending (a:xs@(b:_)) = a < b && strictlyAscending xs - strictlyAscending _ = True - - -size :: DocIdSet -> Int -size (DocIdSet vec) = VU.length vec - -null :: DocIdSet -> Bool -null (DocIdSet vec) = VU.null vec - -empty :: DocIdSet -empty = DocIdSet VU.empty - -singleton :: DocId -> DocIdSet -singleton = DocIdSet . VU.singleton - -fromList :: [DocId] -> DocIdSet -fromList = DocIdSet . VU.fromList . Set.toAscList . Set.fromList - -toList :: DocIdSet -> [DocId] -toList (DocIdSet vec) = VU.toList vec - -toSet :: DocIdSet -> Set DocId -toSet (DocIdSet vec) = Set.fromDistinctAscList (VU.toList vec) - -insert :: DocId -> DocIdSet -> DocIdSet -insert x (DocIdSet vec) = - case binarySearch vec 0 (VU.length vec - 1) x of - (_, True) -> DocIdSet vec - (i, False) -> case VU.splitAt i vec of - (before, after) -> - DocIdSet (VU.concat [before, VU.singleton x, after]) - -delete :: DocId -> DocIdSet -> DocIdSet -delete x (DocIdSet vec) = - case binarySearch vec 0 (VU.length vec - 1) x of - (_, False) -> DocIdSet vec - (i, True) -> case VU.splitAt i vec of - (before, after) -> - DocIdSet (before VU.++ VU.tail after) - -binarySearch :: VU.Vector DocId -> Int -> Int -> DocId -> (Int, Bool) -binarySearch vec !a !b !key - | a > b = (a, False) - | otherwise = - let mid = (a + b) `div` 2 - in case compare key (vec VU.! mid) of - LT -> binarySearch vec a (mid-1) key - EQ -> (mid, True) - GT -> binarySearch vec (mid+1) b key - -union :: DocIdSet -> DocIdSet -> DocIdSet -union x y | null x = y - | null y = x -union (DocIdSet xs) (DocIdSet ys) = - DocIdSet (VU.create (MVU.new sizeBound >>= writeMerged xs ys)) - where - sizeBound = VU.length xs + VU.length ys - -writeMerged :: VU.Vector DocId -> VU.Vector DocId -> - MVU.MVector s DocId -> ST s (MVU.MVector s DocId) -writeMerged xs0 ys0 out = do - i <- go xs0 ys0 0 - return $! MVU.take i out - where - go !xs !ys !i - | VU.null xs = do VU.copy (MVU.slice i (VU.length ys) out) ys; - return (i + VU.length ys) - | VU.null ys = do VU.copy (MVU.slice i (VU.length xs) out) xs; - return (i + VU.length xs) - | otherwise = let x = VU.head xs; y = VU.head ys - in case compare x y of - GT -> do MVU.write out i y - go xs (VU.tail ys) (i+1) - EQ -> do MVU.write out i x - go (VU.tail xs) (VU.tail ys) (i+1) - LT -> do MVU.write out i x - go (VU.tail xs) ys (i+1) - -instance MemSize DocIdSet where - memSize (DocIdSet vec) = memSizeUVector 2 vec - - -------------- --- tests --- -{- -instance Arbitrary DocIdSet where - arbitrary = fromList `fmap` (listOf arbitrary) - -instance Arbitrary DocId where - arbitrary = DocId `fmap` choose (0,15) - - -prop_insert :: DocIdSet -> DocId -> Bool -prop_insert dset x = - let dset' = insert x dset - in invariant dset && invariant dset' - && all (`member` dset') (x : toList dset) - -prop_delete :: DocIdSet -> DocId -> Bool -prop_delete dset x = - let dset' = DocIdSet.delete x dset - in invariant dset && invariant dset' - && all (`member` dset') (List.delete x (toList dset)) - && not (x `member` dset') - -prop_delete' :: DocIdSet -> Bool -prop_delete' dset = - all (prop_delete dset) (toList dset) - -prop_union :: DocIdSet -> DocIdSet -> Bool -prop_union dset1 dset2 = - let dset = union dset1 dset2 - dset' = fromList (List.union (toList dset1) (toList dset2)) - - in invariant dset && invariant dset' - && dset == dset' - -prop_union' :: DocIdSet -> DocIdSet -> Bool -prop_union' dset1 dset2 = - let dset = union dset1 dset2 - dset' = List.foldl' (\s i -> insert i s) dset1 (toList dset2) - dset'' = List.foldl' (\s i -> insert i s) dset2 (toList dset1) - in invariant dset && invariant dset' && invariant dset'' - && dset == dset' - && dset' == dset'' - -member :: DocId -> DocIdSet -> Bool -member x (DocIdSet vec) = - x `List.elem` VU.toList vec --} diff --git a/Distribution/Server/Features/Search/DocTermIds.hs b/Distribution/Server/Features/Search/DocTermIds.hs deleted file mode 100644 index b59edc579..000000000 --- a/Distribution/Server/Features/Search/DocTermIds.hs +++ /dev/null @@ -1,64 +0,0 @@ -{-# LANGUAGE BangPatterns, GeneralizedNewtypeDeriving, ScopedTypeVariables #-} -module Distribution.Server.Features.Search.DocTermIds ( - DocTermIds, - TermId, - fieldLength, - fieldTermCount, - fieldElems, - create, - vecIndexIx, - vecCreateIx, - ) where - -import Distribution.Server.Features.Search.TermBag (TermBag, TermId) -import qualified Distribution.Server.Features.Search.TermBag as TermBag - -import Distribution.Server.Framework.MemSize - -import Data.Vector (Vector, (!)) -import qualified Data.Vector as Vec -import Data.Ix (Ix) -import qualified Data.Ix as Ix - - --- | The 'TermId's for the 'Term's that occur in a document. Documents may have --- multiple fields and the 'DocTerms' type holds them separately for each field. --- -newtype DocTermIds field = DocTermIds (Vector TermBag) - deriving (Show, MemSize) - -getField :: (Ix field, Bounded field) => DocTermIds field -> field -> TermBag -getField (DocTermIds fieldVec) = vecIndexIx fieldVec - -create :: (Ix field, Bounded field) => - (field -> [TermId]) -> DocTermIds field -create docTermIds = - DocTermIds (vecCreateIx (TermBag.fromList . docTermIds)) - --- | The number of terms in a field within the document. -fieldLength :: (Ix field, Bounded field) => DocTermIds field -> field -> Int -fieldLength docterms field = - TermBag.size (getField docterms field) - --- | The frequency of a particular term in a field within the document. -fieldTermCount :: (Ix field, Bounded field) => DocTermIds field -> field -> TermId -> Int -fieldTermCount docterms field termid = - TermBag.termCount (getField docterms field) termid - -fieldElems :: (Ix field, Bounded field) => DocTermIds field -> field -> [TermId] -fieldElems docterms field = - TermBag.elems (getField docterms field) - ---------------------------------- --- Vector indexed by Ix Bounded --- - -vecIndexIx :: forall ix a . (Ix ix, Bounded ix) => Vector a -> ix -> a -vecIndexIx vec ix = vec ! Ix.index (minBound :: ix, maxBound :: ix) ix - -vecCreateIx :: forall ix a . (Ix ix, Bounded ix) => (ix -> a) -> Vector a -vecCreateIx f = Vec.fromListN (Ix.rangeSize bounds) - [ y | ix <- Ix.range bounds, let !y = f ix ] - where - bounds :: (ix, ix) - bounds = (minBound, maxBound) diff --git a/Distribution/Server/Features/Search/PkgSearch.hs b/Distribution/Server/Features/Search/PkgSearch.hs index 0e1fd8832..895c6f41c 100644 --- a/Distribution/Server/Features/Search/PkgSearch.hs +++ b/Distribution/Server/Features/Search/PkgSearch.hs @@ -2,16 +2,19 @@ module Distribution.Server.Features.Search.PkgSearch ( PkgSearchEngine, + PkgSearchRankParameters, initialPkgSearchEngine, defaultSearchRankParameters, PkgDocField(..), PkgDocFeatures, ) where -import Distribution.Server.Features.Search.SearchEngine +-- import Distribution.Server.Features.Search.SearchEngine import Distribution.Server.Features.Search.ExtractNameTerms import Distribution.Server.Features.Search.ExtractDescriptionTerms +import Data.SearchEngine + import Data.Ix import Data.Set (Set) import qualified Data.Set as Set @@ -23,7 +26,6 @@ import Distribution.Package import Distribution.PackageDescription import Distribution.Utils.ShortText import Distribution.Text (display) -import Data.Text (unpack) type PkgSearchEngine = SearchEngine @@ -52,8 +54,7 @@ pkgSearchConfig = documentKey = packageName . fst, extractDocumentTerms = extractTokens . fst, transformQueryTerm = normaliseQueryToken, - documentFeatureValue = getFeatureValue, - makeKey = mkPackageName . unpack + documentFeatureValue = getFeatureValue } where extractTokens :: PackageDescription -> PkgDocField -> [Text] @@ -74,7 +75,9 @@ pkgSearchConfig = getFeatureValue (_pkg, downloadcount) Downloads = fromIntegral downloadcount -defaultSearchRankParameters :: SearchRankParameters PkgDocField PkgDocFeatures +type PkgSearchRankParameters = SearchRankParameters PkgDocField PkgDocFeatures + +defaultSearchRankParameters :: PkgSearchRankParameters defaultSearchRankParameters = SearchRankParameters { paramK1, @@ -83,7 +86,9 @@ defaultSearchRankParameters = paramFeatureWeights, paramFeatureFunctions, paramResultsetSoftLimit = 400, - paramResultsetHardLimit = 800 + paramResultsetHardLimit = 800, + paramAutosuggestPrefilterLimit = 1000, -- TODO: what should these limits be? + paramAutosuggestPostfilterLimit = 1000 } where paramK1 :: Float diff --git a/Distribution/Server/Features/Search/SearchEngine.hs b/Distribution/Server/Features/Search/SearchEngine.hs deleted file mode 100644 index f2cfa4078..000000000 --- a/Distribution/Server/Features/Search/SearchEngine.hs +++ /dev/null @@ -1,405 +0,0 @@ -{-# LANGUAGE BangPatterns, NamedFieldPuns, RecordWildCards #-} -{-# LANGUAGE ScopedTypeVariables #-} - -module Distribution.Server.Features.Search.SearchEngine ( - SearchEngine, - SearchConfig(..), - SearchRankParameters(..), - BM25F.FeatureFunction(..), - Term, - initSearchEngine, - insertDoc, - insertDocs, - deleteDoc, - query, - - NoFeatures, - noFeatures, - - queryExplain, - BM25F.Explanation(..), - setRankParams, - - invariant, - ) where - -import Distribution.Server.Features.Search.SearchIndex (SearchIndex, Term, TermId) -import qualified Distribution.Server.Features.Search.SearchIndex as SI -import Distribution.Server.Features.Search.DocIdSet (DocIdSet, DocId) -import qualified Distribution.Server.Features.Search.DocIdSet as DocIdSet -import Distribution.Server.Features.Search.DocTermIds (DocTermIds) -import qualified Distribution.Server.Features.Search.DocTermIds as DocTermIds -import Distribution.Server.Features.Search.DocFeatVals (DocFeatVals) -import qualified Distribution.Server.Features.Search.DocFeatVals as DocFeatVals -import qualified Distribution.Server.Features.Search.BM25F as BM25F - -import Distribution.Server.Framework.MemSize - -import Data.Ix -import Data.Array.Unboxed -import Data.List -import Data.Function -import Data.Maybe - -------------------- --- Doc layer --- --- That is, at the layer of documents, so covering the issues of: --- - inserting/removing whole documents --- - documents having multiple fields --- - documents having multiple terms --- - transformations (case-fold/normalisation/stemming) on the doc terms --- - transformations on the search terms --- - -data SearchConfig doc key field feature = SearchConfig { - documentKey :: doc -> key, - extractDocumentTerms :: doc -> field -> [Term], - transformQueryTerm :: Term -> field -> Term, - documentFeatureValue :: doc -> feature -> Float, - makeKey :: Term -> key - - } - -data SearchRankParameters field feature = SearchRankParameters { - paramK1 :: !Float, - paramB :: field -> Float, - paramFieldWeights :: field -> Float, - paramFeatureWeights :: feature -> Float, - paramFeatureFunctions :: feature -> BM25F.FeatureFunction, - paramResultsetSoftLimit :: !Int, - paramResultsetHardLimit :: !Int - } - -data SearchEngine doc key field feature = SearchEngine { - searchIndex :: !(SearchIndex key field feature), - searchConfig :: !(SearchConfig doc key field feature), - searchRankParams :: !(SearchRankParameters field feature), - - -- cached info - sumFieldLengths :: !(UArray field Int), - bm25Context :: BM25F.Context TermId field feature - } - -initSearchEngine :: (Ix field, Bounded field, Ix feature, Bounded feature) => - SearchConfig doc key field feature -> - SearchRankParameters field feature -> - SearchEngine doc key field feature -initSearchEngine config params = - cacheBM25Context - SearchEngine { - searchIndex = SI.emptySearchIndex, - searchConfig = config, - searchRankParams = params, - sumFieldLengths = listArray (minBound, maxBound) (repeat 0), - -- FIXME this use of undefined bears explaining - bm25Context = undefined - } - -setRankParams :: SearchRankParameters field feature -> - SearchEngine doc key field feature -> - SearchEngine doc key field feature -setRankParams params@SearchRankParameters{..} se = - se { - searchRankParams = params, - bm25Context = (bm25Context se) { - BM25F.paramK1 = paramK1, - BM25F.paramB = paramB, - BM25F.fieldWeight = paramFieldWeights, - BM25F.featureWeight = paramFeatureWeights, - BM25F.featureFunction = paramFeatureFunctions - } - } - -invariant :: (Ord key, Ix field, Bounded field) => - SearchEngine doc key field feature -> Bool -invariant SearchEngine{searchIndex} = - SI.invariant searchIndex --- && check caches - -cacheBM25Context :: Ix field => - SearchEngine doc key field feature -> - SearchEngine doc key field feature -cacheBM25Context - se@SearchEngine { - searchRankParams = SearchRankParameters{..}, - searchIndex, - sumFieldLengths - } - = se { bm25Context = bm25Context' } - where - bm25Context' = BM25F.Context { - BM25F.numDocsTotal = SI.docCount searchIndex, - BM25F.avgFieldLength = \f -> fromIntegral (sumFieldLengths ! f) - / fromIntegral (SI.docCount searchIndex), - BM25F.numDocsWithTerm = DocIdSet.size . SI.lookupTermId searchIndex, - BM25F.paramK1 = paramK1, - BM25F.paramB = paramB, - BM25F.fieldWeight = paramFieldWeights, - BM25F.featureWeight = paramFeatureWeights, - BM25F.featureFunction = paramFeatureFunctions - } - -updateCachedFieldLengths :: (Ix field, Bounded field) => - Maybe (DocTermIds field) -> Maybe (DocTermIds field) -> - SearchEngine doc key field feature -> - SearchEngine doc key field feature -updateCachedFieldLengths Nothing (Just newDoc) se@SearchEngine{sumFieldLengths} = - se { - sumFieldLengths = - array (bounds sumFieldLengths) - [ (i, n + DocTermIds.fieldLength newDoc i) - | (i, n) <- assocs sumFieldLengths ] - } -updateCachedFieldLengths (Just oldDoc) (Just newDoc) se@SearchEngine{sumFieldLengths} = - se { - sumFieldLengths = - array (bounds sumFieldLengths) - [ (i, n - DocTermIds.fieldLength oldDoc i - + DocTermIds.fieldLength newDoc i) - | (i, n) <- assocs sumFieldLengths ] - } -updateCachedFieldLengths (Just oldDoc) Nothing se@SearchEngine{sumFieldLengths} = - se { - sumFieldLengths = - array (bounds sumFieldLengths) - [ (i, n - DocTermIds.fieldLength oldDoc i) - | (i, n) <- assocs sumFieldLengths ] - } -updateCachedFieldLengths Nothing Nothing se = se - -insertDocs :: (Ord key, Ix field, Bounded field, Ix feature, Bounded feature) => - [doc] -> - SearchEngine doc key field feature -> - SearchEngine doc key field feature -insertDocs docs se = foldl' (\se' doc -> insertDoc doc se') se docs - -insertDoc :: (Ord key, Ix field, Bounded field, Ix feature, Bounded feature) => - doc -> - SearchEngine doc key field feature -> - SearchEngine doc key field feature -insertDoc doc se@SearchEngine{ searchConfig = SearchConfig { - documentKey, - extractDocumentTerms, - documentFeatureValue - } - , searchIndex } = - let key = documentKey doc - searchIndex' = SI.insertDoc key (extractDocumentTerms doc) - (documentFeatureValue doc) - searchIndex - oldDoc = SI.lookupDocKey searchIndex key - newDoc = SI.lookupDocKey searchIndex' key - - in cacheBM25Context $ - updateCachedFieldLengths oldDoc newDoc $ - se { searchIndex = searchIndex' } - -deleteDoc :: (Ord key, Ix field, Bounded field) => - key -> - SearchEngine doc key field feature -> - SearchEngine doc key field feature -deleteDoc key se@SearchEngine{searchIndex} = - let searchIndex' = SI.deleteDoc key searchIndex - oldDoc = SI.lookupDocKey searchIndex key - - in cacheBM25Context $ - updateCachedFieldLengths oldDoc Nothing $ - se { searchIndex = searchIndex' } - -query :: (Ix field, Bounded field, Ix feature, Bounded feature, Ord key) => - SearchEngine doc key field feature -> - [Term] -> [key] -query se@SearchEngine{ searchIndex, - searchConfig = SearchConfig{transformQueryTerm, makeKey}, - searchRankParams = SearchRankParameters{..} } - terms = - - let -- Start by transforming/normalising all the query terms. - -- This can be done differently for each field we search by. - lookupTerms :: [Term] - lookupTerms = [ term' - | term <- terms - , let transformForField = transformQueryTerm term - , term' <- nub [ transformForField field - | field <- range (minBound, maxBound) ] - ] - - -- Then we look up all the normalised terms in the index. - rawresults :: [Maybe (TermId, DocIdSet)] - rawresults = map (SI.lookupTerm searchIndex) lookupTerms - - -- Check if there is one term then it exactly matches a package - exactMatch :: Maybe DocId - exactMatch = case terms of - [] -> Nothing - [x] -> SI.lookupDocKeyReal searchIndex (makeKey x) - (_:_) -> Nothing - - -- For the terms that occur in the index, this gives us the term's id - -- and the set of documents that the term occurs in. - termids :: [TermId] - docidsets :: [DocIdSet] - (termids, docidsets) = unzip (catMaybes rawresults) - - -- We looked up the documents that *any* of the term occur in (not all) - -- so this could be rather a lot of docs if the user uses a few common - -- terms. Scoring these result docs is a non-trivial cost so we want to - -- limit the number that we have to score. The standard trick is to - -- consider the doc sets in the order of size, smallest to biggest. Once - -- we have gone over a certain threshold of docs then don't bother with - -- the doc sets for the remaining terms. This tends to work because the - -- scoring gives lower weight to terms that occur in many documents. - unrankedResults :: DocIdSet - unrankedResults = pruneRelevantResults - paramResultsetSoftLimit - paramResultsetHardLimit - docidsets - - --TODO: technically this isn't quite correct. Because each field can - -- be normalised differently, we can end up with different termids for - -- the same original search term, and then we score those as if they - -- were different terms, which makes a difference when the term appears - -- in multiple fields (exactly the case BM25F is supposed to deal with). - -- What we ought to have instead is an Array (Int, field) TermId, and - -- make the scoring use the appropriate termid for each field, but to - -- consider them the "same" term. - in rankResults se exactMatch termids (DocIdSet.toList unrankedResults) - -rankResults :: forall field key feature doc . - (Ix field, Bounded field, Ix feature, Bounded feature) => - SearchEngine doc key field feature -> Maybe DocId -> - [TermId] -> [DocId] -> [key] -rankResults se@SearchEngine{searchIndex} exactMatch queryTerms docids = - maybe id prependExactMatch exactMatch (map snd - $ sortBy (flip compare `on` fst) - [ (relevanceScore se queryTerms doctermids docfeatvals, dockey) - | docid <- docids - , maybe True (/= docid) exactMatch - , let (dockey, doctermids, docfeatvals) = SI.lookupDocId searchIndex docid ]) - where - prependExactMatch :: DocId -> [key] -> [key] - prependExactMatch docid keys = SI.lookupDocId' searchIndex docid : keys - -relevanceScore :: (Ix field, Bounded field, Ix feature, Bounded feature) => - SearchEngine doc key field feature -> - [TermId] -> DocTermIds field -> DocFeatVals feature -> Float -relevanceScore SearchEngine{bm25Context} queryTerms doctermids docfeatvals = - BM25F.score bm25Context doc queryTerms - where - doc = indexDocToBM25Doc doctermids docfeatvals - -indexDocToBM25Doc :: (Ix field, Bounded field, Ix feature, Bounded feature) => - DocTermIds field -> - DocFeatVals feature -> - BM25F.Doc TermId field feature -indexDocToBM25Doc doctermids docfeatvals = - BM25F.Doc { - BM25F.docFieldLength = DocTermIds.fieldLength doctermids, - BM25F.docFieldTermFrequency = DocTermIds.fieldTermCount doctermids, - BM25F.docFeatureValue = DocFeatVals.featureValue docfeatvals - } - -pruneRelevantResults :: Int -> Int -> [DocIdSet] -> DocIdSet -pruneRelevantResults softLimit hardLimit = - -- Look at the docsets starting with the smallest ones. Smaller docsets - -- correspond to the rarer terms, which are the ones that score most highly. - go DocIdSet.empty . sortBy (compare `on` DocIdSet.size) - where - go !acc [] = acc - go !acc (d:ds) - -- If this is the first one, we add it anyway, otherwise we're in - -- danger of returning no results at all. - | DocIdSet.null acc = go d ds - -- We consider the size our docset would be if we add this extra one... - -- If it puts us over the hard limit then stop. - | size > hardLimit = acc - -- If it puts us over soft limit then we add it and stop - | size > softLimit = DocIdSet.union acc d - -- Otherwise we can add it and carry on to consider the remainder - | otherwise = go (DocIdSet.union acc d) ds - where - size = DocIdSet.size acc + DocIdSet.size d - ------------------------------ - -queryExplain :: (Ix field, Bounded field, Ix feature, Bounded feature, Ord key) => - SearchEngine doc key field feature -> - [Term] -> (Maybe key, [(BM25F.Explanation field feature Term, key)]) -queryExplain se@SearchEngine{ searchIndex, - searchConfig = SearchConfig{transformQueryTerm, makeKey}, - searchRankParams = SearchRankParameters{..} } - terms = - - -- See 'query' above for explanation. Really we ought to combine them. - let lookupTerms :: [Term] - lookupTerms = [ term' - | term <- terms - , let transformForField = transformQueryTerm term - , term' <- nub [ transformForField field - | field <- range (minBound, maxBound) ] - ] - - exactMatch :: Maybe DocId - exactMatch = case terms of - [] -> Nothing - [x] -> SI.lookupDocKeyReal searchIndex (makeKey x) - (_:_) -> Nothing - - rawresults :: [Maybe (TermId, DocIdSet)] - rawresults = map (SI.lookupTerm searchIndex) lookupTerms - - termids :: [TermId] - docidsets :: [DocIdSet] - (termids, docidsets) = unzip (catMaybes rawresults) - - unrankedResults :: DocIdSet - unrankedResults = pruneRelevantResults - paramResultsetSoftLimit - paramResultsetHardLimit - docidsets - - in ( fmap (SI.lookupDocId' searchIndex) exactMatch - , rankExplainResults se termids (DocIdSet.toList unrankedResults) - ) - -rankExplainResults :: (Ix field, Bounded field, Ix feature, Bounded feature) => - SearchEngine doc key field feature -> - [TermId] -> - [DocId] -> - [(BM25F.Explanation field feature Term, key)] -rankExplainResults se@SearchEngine{searchIndex} queryTerms docids = - sortBy (flip compare `on` (BM25F.overallScore . fst)) - [ (explainRelevanceScore se queryTerms doctermids docfeatvals, dockey) - | docid <- docids - , let (dockey, doctermids, docfeatvals) = SI.lookupDocId searchIndex docid ] - -explainRelevanceScore :: (Ix field, Bounded field, Ix feature, Bounded feature) => - SearchEngine doc key field feature -> - [TermId] -> - DocTermIds field -> - DocFeatVals feature -> - BM25F.Explanation field feature Term -explainRelevanceScore SearchEngine{bm25Context, searchIndex} - queryTerms doctermids docfeatvals = - fmap (SI.getTerm searchIndex) (BM25F.explain bm25Context doc queryTerms) - where - doc = indexDocToBM25Doc doctermids docfeatvals - ------------------------------ - -data NoFeatures = NoFeatures - deriving (Eq, Ord, Bounded) - -instance Ix NoFeatures where - range _ = [] - inRange _ _ = False - index _ _ = -1 - -noFeatures :: NoFeatures -> a -noFeatures _ = error "noFeatures" - ------------------------------ - -instance MemSize key => MemSize (SearchEngine doc key field feature) where - memSize SearchEngine {searchIndex} = 25 + memSize searchIndex diff --git a/Distribution/Server/Features/Search/SearchIndex.hs b/Distribution/Server/Features/Search/SearchIndex.hs deleted file mode 100644 index cfb2291e9..000000000 --- a/Distribution/Server/Features/Search/SearchIndex.hs +++ /dev/null @@ -1,414 +0,0 @@ -{-# LANGUAGE BangPatterns, NamedFieldPuns #-} - -module Distribution.Server.Features.Search.SearchIndex ( - SearchIndex, - Term, - TermId, - DocId, - - emptySearchIndex, - insertDoc, - deleteDoc, - - docCount, - lookupTerm, - lookupTermId, - lookupDocId, - lookupDocId', - lookupDocKey, - lookupDocKeyReal, - - getTerm, - - invariant, - ) where - -import Distribution.Server.Features.Search.DocIdSet (DocIdSet, DocId) -import qualified Distribution.Server.Features.Search.DocIdSet as DocIdSet -import Distribution.Server.Features.Search.DocTermIds (DocTermIds, TermId, vecIndexIx, vecCreateIx) -import qualified Distribution.Server.Features.Search.DocTermIds as DocTermIds -import Distribution.Server.Features.Search.DocFeatVals (DocFeatVals) -import qualified Distribution.Server.Features.Search.DocFeatVals as DocFeatVals - -import Distribution.Server.Framework.MemSize - -import Data.Ix (Ix) -import qualified Data.Ix as Ix -import Data.Map (Map) -import qualified Data.Map as Map -import Data.IntMap (IntMap) -import qualified Data.IntMap as IntMap -import qualified Data.Set as Set -import Data.Text (Text) -import Data.List (foldl') - -import Control.Exception (assert) - -type Term = Text - --- | The search index is essentially a many-to-many mapping between documents --- and terms. Each document contains many terms and each term occurs in many --- documents. It is a bidirectional mapping as we need to support lookups in --- both directions. --- --- Documents are identified by a key (in Ord) while terms are text values. --- Inside the index however we assign compact numeric ids to both documents and --- terms. The advantage of this is a much more compact in-memory representation --- and the disadvantage is greater complexity. In particular it means we have --- to manage bidirectional mappings between document keys and ids, and between --- terms and term ids. --- --- So the mappings we maintain can be depicted as: --- --- > Term <-- 1:1 --> TermId --- > ^ --- > | --- > many:many --- > | --- > v --- > DocKey <-- 1:1 --> DocId --- --- For efficiency, these details are exposed in the interface. In particular --- the mapping from TermId to many DocIds is exposed via a 'DocIdSet', --- and the mapping from DocIds to TermIds is exposed via 'DocTermIds'. --- -data SearchIndex key field feature = SearchIndex { - -- the indexes - termMap :: !(Map Term TermInfo), - termIdMap :: !(IntMap Term), - docIdMap :: !(IntMap (DocInfo key field feature)), - docKeyMap :: !(Map key DocId), - - -- auto-increment key counters - nextTermId :: TermId, - nextDocId :: DocId - } - deriving Show - -data TermInfo = TermInfo !TermId !DocIdSet - deriving Show - -data DocInfo key field feature = DocInfo !key !(DocTermIds field) - !(DocFeatVals feature) - deriving Show - - ------------------------ --- SearchIndex basics --- - -emptySearchIndex :: SearchIndex key field feature -emptySearchIndex = - SearchIndex - Map.empty - IntMap.empty - IntMap.empty - Map.empty - minBound - minBound - -checkInvariant :: (Ord key, Ix field, Bounded field) => - SearchIndex key field feature -> SearchIndex key field feature -checkInvariant si = assert (invariant si) si - -invariant :: (Ord key, Ix field, Bounded field) => - SearchIndex key field feature -> Bool -invariant SearchIndex{termMap, termIdMap, docKeyMap, docIdMap} = - and [ IntMap.lookup (fromEnum termId) termIdMap == Just term - | (term, (TermInfo termId _)) <- Map.assocs termMap ] - && and [ case Map.lookup term termMap of - Just (TermInfo termId' _) -> toEnum termId == termId' - Nothing -> False - | (termId, term) <- IntMap.assocs termIdMap ] - && and [ case IntMap.lookup (fromEnum docId) docIdMap of - Just (DocInfo docKey' _ _) -> docKey == docKey' - Nothing -> False - | (docKey, docId) <- Map.assocs docKeyMap ] - && and [ Map.lookup docKey docKeyMap == Just (toEnum docId) - | (docId, DocInfo docKey _ _) <- IntMap.assocs docIdMap ] - && and [ DocIdSet.invariant docIdSet - | (_term, (TermInfo _ docIdSet)) <- Map.assocs termMap ] - && and [ any (\field -> DocTermIds.fieldTermCount docterms field termId > 0) fields - | (_term, (TermInfo termId docIdSet)) <- Map.assocs termMap - , docId <- DocIdSet.toList docIdSet - , let DocInfo _ docterms _ = docIdMap IntMap.! fromEnum docId ] - && and [ IntMap.member (fromEnum termid) termIdMap - | (_docId, DocInfo _ docTerms _) <- IntMap.assocs docIdMap - , field <- fields - , termid <- DocTermIds.fieldElems docTerms field ] - where - fields = Ix.range (minBound, maxBound) - - -------------------- --- Lookups --- - -docCount :: SearchIndex key field feature -> Int -docCount SearchIndex{docIdMap} = IntMap.size docIdMap - -lookupTerm :: SearchIndex key field feature -> Term -> Maybe (TermId, DocIdSet) -lookupTerm SearchIndex{termMap} term = - case Map.lookup term termMap of - Nothing -> Nothing - Just (TermInfo termid docidset) -> Just (termid, docidset) - -lookupTermId :: SearchIndex key field feature -> TermId -> DocIdSet -lookupTermId SearchIndex{termIdMap, termMap} termid = - case IntMap.lookup (fromEnum termid) termIdMap of - Nothing -> error $ "lookupTermId: not found " ++ show termid - Just term -> - case Map.lookup term termMap of - Nothing -> error "lookupTermId: internal error" - Just (TermInfo _ docidset) -> docidset - -lookupDocId :: SearchIndex key field feature -> - DocId -> (key, DocTermIds field, DocFeatVals feature) -lookupDocId SearchIndex{docIdMap} docid = - case IntMap.lookup (fromEnum docid) docIdMap of - Nothing -> errNotFound - Just (DocInfo key doctermids docfeatvals) -> (key, doctermids, docfeatvals) - where - errNotFound = error $ "lookupDocId: not found " ++ show docid - -lookupDocId' :: SearchIndex key field feature -> DocId -> key -lookupDocId' searchIndex docId = dockey - where - (dockey, _, _) = lookupDocId searchIndex docId - -lookupDocKey :: Ord key => SearchIndex key field feature -> key -> Maybe (DocTermIds field) -lookupDocKey SearchIndex{docKeyMap, docIdMap} key = - case Map.lookup key docKeyMap of - Nothing -> Nothing - Just docid -> - case IntMap.lookup (fromEnum docid) docIdMap of - Nothing -> error "lookupDocKey: internal error" - Just (DocInfo _ doctermids _) -> Just doctermids - -lookupDocKeyReal :: Ord key => SearchIndex key field feature -> key -> Maybe DocId -lookupDocKeyReal SearchIndex{docKeyMap} key = Map.lookup key docKeyMap - -getTerm :: SearchIndex key field feature -> TermId -> Term -getTerm SearchIndex{termIdMap} termId = - termIdMap IntMap.! fromEnum termId - -getTermId :: SearchIndex key field feature -> Term -> TermId -getTermId SearchIndex{termMap} term = - case termMap Map.! term of TermInfo termid _ -> termid - -getDocTermIds :: SearchIndex key field feature -> DocId -> DocTermIds field -getDocTermIds SearchIndex{docIdMap} docid = - case docIdMap IntMap.! fromEnum docid of - DocInfo _ doctermids _ -> doctermids - --------------------- --- Insert & delete --- - --- Procedure for adding a new doc... --- (key, field -> [Term]) --- alloc docid for key --- add term occurrences for docid (include rev map for termid) --- construct indexdoc now that we have all the term -> termid entries --- insert indexdoc - --- Procedure for updating a doc... --- (key, field -> [Term]) --- find docid for key --- lookup old terms for docid (using termid rev map) --- calc term occurrences to add, term occurrences to delete --- add new term occurrences, delete old term occurrences --- construct indexdoc now that we have all the term -> termid entries --- insert indexdoc - --- Procedure for deleting a doc... --- (key, field -> [Term]) --- find docid for key --- lookup old terms for docid (using termid rev map) --- delete old term occurrences --- delete indexdoc - --- | This is the representation for documents to be added to the index. --- Documents may --- -type DocTerms field = field -> [Term] -type DocFeatureValues feature = feature -> Float - -insertDoc :: (Ord key, Ix field, Bounded field, Ix feature, Bounded feature) => - key -> DocTerms field -> DocFeatureValues feature -> - SearchIndex key field feature -> SearchIndex key field feature -insertDoc key userDocTerms userDocFeats si@SearchIndex{docKeyMap} - | Just docid <- Map.lookup key docKeyMap - = -- Some older version of the doc is already present in the index, - -- So we keep its docid. Now have to update the doc itself - -- and update the terms by removing old ones and adding new ones. - let oldTermsIds = getDocTermIds si docid - userDocTerms' = memoiseDocTerms userDocTerms - newTerms = docTermSet userDocTerms' - oldTerms = docTermIdsTermSet si oldTermsIds - -- We optimise for the typical case of significant overlap between - -- the terms in the old and new versions of the document. - delTerms = oldTerms `Set.difference` newTerms - addTerms = newTerms `Set.difference` oldTerms - - -- Note: adding the doc relies on all the terms being in the termMap - -- already, so we first add all the term occurrences for the docid. - in checkInvariant - . insertDocIdToDocEntry docid key userDocTerms' userDocFeats - . insertTermToDocIdEntries (Set.toList addTerms) docid - . deleteTermToDocIdEntries (Set.toList delTerms) docid - $ si - - | otherwise - = -- We're dealing with a new doc, so allocate a docid for the key - let (si', docid) = allocFreshDocId si - userDocTerms' = memoiseDocTerms userDocTerms - addTerms = docTermSet userDocTerms' - - -- Note: adding the doc relies on all the terms being in the termMap - -- already, so we first add all the term occurrences for the docid. - in checkInvariant - . insertDocIdToDocEntry docid key userDocTerms' userDocFeats - . insertDocKeyToIdEntry key docid - . insertTermToDocIdEntries (Set.toList addTerms) docid - $ si' - -deleteDoc :: (Ord key, Ix field, Bounded field) => - key -> - SearchIndex key field feature -> SearchIndex key field feature -deleteDoc key si@SearchIndex{docKeyMap} - | Just docid <- Map.lookup key docKeyMap - = let oldTermsIds = getDocTermIds si docid - oldTerms = docTermIdsTermSet si oldTermsIds - in checkInvariant - . deleteDocEntry docid key - . deleteTermToDocIdEntries (Set.toList oldTerms) docid - $ si - - | otherwise = si - - ----------------------------------- --- Insert & delete support utils --- - - -memoiseDocTerms :: (Ix field, Bounded field) => DocTerms field -> DocTerms field -memoiseDocTerms docTermsFn = - \field -> vecIndexIx vec field - where - vec = vecCreateIx docTermsFn - -docTermSet :: (Bounded t, Ix t) => DocTerms t -> Set.Set Term -docTermSet docterms = - Set.unions [ Set.fromList (docterms field) - | field <- Ix.range (minBound, maxBound) ] - -docTermIdsTermSet :: (Bounded field, Ix field) => - SearchIndex key field feature -> - DocTermIds field -> Set.Set Term -docTermIdsTermSet si doctermids = - Set.unions [ Set.fromList terms - | field <- Ix.range (minBound, maxBound) - , let termids = DocTermIds.fieldElems doctermids field - terms = map (getTerm si) termids ] - --- --- The Term <-> DocId mapping --- - --- | Add an entry into the 'Term' to 'DocId' mapping. -insertTermToDocIdEntry :: Term -> DocId -> - SearchIndex key field feature -> - SearchIndex key field feature -insertTermToDocIdEntry term !docid si@SearchIndex{termMap, termIdMap, nextTermId} = - case Map.lookup term termMap of - Nothing -> - let !termInfo' = TermInfo nextTermId (DocIdSet.singleton docid) - in si { termMap = Map.insert term termInfo' termMap - , termIdMap = IntMap.insert (fromEnum nextTermId) term termIdMap - , nextTermId = succ nextTermId } - - Just (TermInfo termId docIdSet) -> - let !termInfo' = TermInfo termId (DocIdSet.insert docid docIdSet) - in si { termMap = Map.insert term termInfo' termMap } - --- | Add multiple entries into the 'Term' to 'DocId' mapping: many terms that --- map to the same document. -insertTermToDocIdEntries :: [Term] -> DocId -> - SearchIndex key field feature -> - SearchIndex key field feature -insertTermToDocIdEntries terms !docid si = - foldl' (\si' term -> insertTermToDocIdEntry term docid si') si terms - --- | Delete an entry from the 'Term' to 'DocId' mapping. -deleteTermToDocIdEntry :: Term -> DocId -> - SearchIndex key field feature -> - SearchIndex key field feature -deleteTermToDocIdEntry term !docid si@SearchIndex{termMap, termIdMap} = - case Map.lookup term termMap of - Nothing -> si - Just (TermInfo termId docIdSet) -> - let docIdSet' = DocIdSet.delete docid docIdSet - termInfo' = TermInfo termId docIdSet' - in if DocIdSet.null docIdSet' - then si { termMap = Map.delete term termMap - , termIdMap = IntMap.delete (fromEnum termId) termIdMap } - else si { termMap = Map.insert term termInfo' termMap } - --- | Delete multiple entries from the 'Term' to 'DocId' mapping: many terms --- that map to the same document. -deleteTermToDocIdEntries :: [Term] -> DocId -> - SearchIndex key field feature -> - SearchIndex key field feature -deleteTermToDocIdEntries terms !docid si = - foldl' (\si' term -> deleteTermToDocIdEntry term docid si') si terms - --- --- The DocId <-> Doc mapping --- - -allocFreshDocId :: SearchIndex key field feature -> - (SearchIndex key field feature, DocId) -allocFreshDocId si@SearchIndex{nextDocId} = - let !si' = si { nextDocId = succ nextDocId } - in (si', nextDocId) - -insertDocKeyToIdEntry :: Ord key => key -> DocId -> - SearchIndex key field feature -> - SearchIndex key field feature -insertDocKeyToIdEntry dockey !docid si@SearchIndex{docKeyMap} = - si { docKeyMap = Map.insert dockey docid docKeyMap } - -insertDocIdToDocEntry :: (Ix field, Bounded field, - Ix feature, Bounded feature) => - DocId -> key -> - DocTerms field -> - DocFeatureValues feature -> - SearchIndex key field feature -> - SearchIndex key field feature -insertDocIdToDocEntry !docid dockey userdocterms userdocfeats - si@SearchIndex{docIdMap} = - let doctermids = DocTermIds.create (map (getTermId si) . userdocterms) - docfeatvals= DocFeatVals.create userdocfeats - !docinfo = DocInfo dockey doctermids docfeatvals - in si { docIdMap = IntMap.insert (fromEnum docid) docinfo docIdMap } - -deleteDocEntry :: Ord key => DocId -> key -> - SearchIndex key field feature -> SearchIndex key field feature -deleteDocEntry docid key si@SearchIndex{docIdMap, docKeyMap} = - si { docIdMap = IntMap.delete (fromEnum docid) docIdMap - , docKeyMap = Map.delete key docKeyMap } - - ----------------------- --- MemSize instances - -instance MemSize key => MemSize (SearchIndex key field feature) where - memSize (SearchIndex a b c d e f) = memSize6 a b c d e f - -instance MemSize TermInfo where - memSize (TermInfo a b) = memSize2 a b - -instance MemSize key => MemSize (DocInfo key field feature) where - memSize (DocInfo a b c) = memSize3 a b c diff --git a/Distribution/Server/Features/Search/TermBag.hs b/Distribution/Server/Features/Search/TermBag.hs deleted file mode 100644 index 325e739b9..000000000 --- a/Distribution/Server/Features/Search/TermBag.hs +++ /dev/null @@ -1,76 +0,0 @@ -{-# LANGUAGE BangPatterns, GeneralizedNewtypeDeriving #-} -module Distribution.Server.Features.Search.TermBag ( - TermId, - TermBag, - size, - fromList, - elems, - termCount, - ) where - -import Distribution.Server.Framework.MemSize - -import qualified Data.Vector.Unboxed as Vec -import qualified Data.Map as Map -import Data.Word (Word32) -import Data.Bits - -newtype TermId = TermId Word32 - deriving (Eq, Ord, Show, Enum, MemSize) - -instance Bounded TermId where - minBound = TermId 0 - maxBound = TermId 0x00FFFFFF - -data TermBag = TermBag !Int !(Vec.Vector TermIdAndCount) - deriving Show - --- We sneakily stuff both the TermId and the bag count into one 32bit word -type TermIdAndCount = Word32 - --- Bottom 24 bits is the TermId, top 8 bits is the bag count -termIdAndCount :: TermId -> Int -> TermIdAndCount -termIdAndCount (TermId termid) freq = - (min (fromIntegral freq) 255 `shiftL` 24) - .|. (termid .&. 0x00FFFFFF) - -getTermId :: TermIdAndCount -> TermId -getTermId word = TermId (word .&. 0x00FFFFFF) - -getTermCount :: TermIdAndCount -> Int -getTermCount word = fromIntegral (word `shiftR` 24) - - -size :: TermBag -> Int -size (TermBag sz _) = sz - -elems :: TermBag -> [TermId] -elems (TermBag _ vec) = map getTermId (Vec.toList vec) - -termCount :: TermBag -> TermId -> Int -termCount (TermBag _ vec) = - binarySearch 0 (Vec.length vec - 1) - where - binarySearch :: Int -> Int -> TermId -> Int - binarySearch !a !b !key - | a > b = 0 - | otherwise = - let mid = (a + b) `div` 2 - tidAndCount = vec Vec.! mid - in case compare key (getTermId tidAndCount) of - LT -> binarySearch a (mid-1) key - EQ -> getTermCount tidAndCount - GT -> binarySearch (mid+1) b key - -fromList :: [TermId] -> TermBag -fromList termids = - let bag = Map.fromListWith (+) [ (t, 1) | t <- termids ] - sz = Map.foldl' (+) 0 bag - vec = Vec.fromListN (Map.size bag) - [ termIdAndCount termid freq - | (termid, freq) <- Map.toAscList bag ] - in TermBag sz vec - -instance MemSize TermBag where - memSize (TermBag _ vec) = 2 + memSizeUVector 2 vec - diff --git a/Distribution/Server/Framework/MemSize.hs b/Distribution/Server/Framework/MemSize.hs index b1cc3371f..128f3e16d 100644 --- a/Distribution/Server/Framework/MemSize.hs +++ b/Distribution/Server/Framework/MemSize.hs @@ -28,6 +28,7 @@ import qualified Data.Array.Unboxed as A import qualified Data.Vector as V import qualified Data.Vector.Unboxed as V.U import qualified Data.Version as Ver +import qualified Data.SearchEngine as SE import Distribution.Package (PackageIdentifier(..), PackageName, unPackageName) import Distribution.PackageDescription (FlagName, unFlagName) @@ -269,3 +270,9 @@ instance MemSize CompilerFlavor where instance MemSize CompilerId where memSize (CompilerId a b) = memSize2 a b + + +instance MemSize key => MemSize (SE.SearchEngine doc key field feature) where + -- TODO: what can we do about MemSize for SearchEngine? + -- memSize SE.SearchEngine {searchIndex} = 25 + memSize searchIndex + memSize _ = memSize0 diff --git a/hackage-server.cabal b/hackage-server.cabal index 88f593a18..22f428457 100644 --- a/hackage-server.cabal +++ b/hackage-server.cabal @@ -310,16 +310,9 @@ library lib-server Distribution.Server.Features.HaskellPlatform Distribution.Server.Features.HaskellPlatform.State Distribution.Server.Features.Search - Distribution.Server.Features.Search.BM25F - Distribution.Server.Features.Search.DocIdSet - Distribution.Server.Features.Search.DocTermIds - Distribution.Server.Features.Search.DocFeatVals Distribution.Server.Features.Search.ExtractDescriptionTerms Distribution.Server.Features.Search.ExtractNameTerms Distribution.Server.Features.Search.PkgSearch - Distribution.Server.Features.Search.SearchEngine - Distribution.Server.Features.Search.SearchIndex - Distribution.Server.Features.Search.TermBag Distribution.Server.Features.Sitemap.Functions Distribution.Server.Features.Votes Distribution.Server.Features.Votes.State @@ -369,6 +362,7 @@ library lib-server , cryptohash-sha256 ^>= 0.11.100 , csv ^>= 0.1 , ed25519 ^>= 0.0.5 + , full-text-search ^>= 0.2.1.1 , hackage-security ^>= 0.6 , hackage-security-HTTP ^>= 0.1.1 , haddock-library > 1.7 && < 2