From 9fe9494f3a5e471df84c21894929319b1328cd30 Mon Sep 17 00:00:00 2001 From: Alias Qli <2576814881@qq.com> Date: Wed, 13 Jul 2022 00:08:00 +0800 Subject: [PATCH 1/2] Divide sitemap into parts --- src/Distribution/Server/Features/Sitemap.hs | 66 ++++++++++++++----- .../Server/Features/Sitemap/Functions.hs | 17 +++++ 2 files changed, 67 insertions(+), 16 deletions(-) diff --git a/src/Distribution/Server/Features/Sitemap.hs b/src/Distribution/Server/Features/Sitemap.hs index 198452c39..429adc67b 100644 --- a/src/Distribution/Server/Features/Sitemap.hs +++ b/src/Distribution/Server/Features/Sitemap.hs @@ -25,7 +25,21 @@ import Data.ByteString.Lazy (ByteString) import Data.Time.Clock (UTCTime(..), getCurrentTime) import Data.Time.Calendar (showGregorian) import Network.URI +import Control.DeepSeq +import Text.Read +import Data.List.Split +data Sitemap + = Sitemap + { sitemapIndex :: XMLResponse + , sitemaps :: [XMLResponse] + } + +instance NFData Sitemap where + rnf (Sitemap i s) = rnf i `seq` rnf s + +instance MemSize Sitemap where + memSize (Sitemap i s) = memSize2 i s data SitemapFeature = SitemapFeature { sitemapFeatureInterface :: HackageFeature @@ -67,8 +81,8 @@ sitemapFeature :: ServerEnv -> DocumentationFeature -> TagsFeature -> UTCTime - -> AsyncCache XMLResponse - -> (SitemapFeature, IO XMLResponse) + -> AsyncCache Sitemap + -> (SitemapFeature, IO Sitemap) sitemapFeature ServerEnv{..} CoreFeature{..} DocumentationFeature{..} @@ -79,50 +93,70 @@ sitemapFeature ServerEnv{..} where sitemapFeatureInterface = (emptyHackageFeature "sitemap") { - featureResources = [ xmlSitemapResource ] + featureResources = [ xmlSitemapIndexResource, xmlSitemapResource ] , featureState = [] - , featureDesc = "Provides a sitemap.xml for search engines" + , featureDesc = "Provides sitemap for search engines" , featureCaches = [ CacheComponent { - cacheDesc = "sitemap.xml", + cacheDesc = "sitemap", getCacheMemSize = memSize <$> readAsyncCache sitemapCache } ] , featurePostInit = do syncAsyncCache sitemapCache addCronJob serverCron CronJob { - cronJobName = "regenerate the cached sitemap.xml", + cronJobName = "regenerate the cached sitemap", cronJobFrequency = DailyJobFrequency, cronJobOneShot = False, cronJobAction = prodAsyncCache sitemapCache "cron" } } + xmlSitemapIndexResource :: Resource + xmlSitemapIndexResource = (resourceAt "/sitemap_index.xml") { + resourceDesc = [(GET, "The dynamically generated sitemap index, in XML format")] + , resourceGet = [("xml", serveSitemapIndex)] + } + xmlSitemapResource :: Resource - xmlSitemapResource = (resourceAt "/sitemap.xml") { + xmlSitemapResource = (resourceAt "/sitemap/:filename") { resourceDesc = [(GET, "The dynamically generated sitemap, in XML format")] , resourceGet = [("xml", serveSitemap)] } - serveSitemap :: DynamicPath -> ServerPartE Response - serveSitemap _ = do - sitemapXML <- liftIO $ readAsyncCache sitemapCache + serveSitemapIndex :: DynamicPath -> ServerPartE Response + serveSitemapIndex _ = do + Sitemap{..} <- liftIO $ readAsyncCache sitemapCache cacheControlWithoutETag [Public, maxAgeDays 1] - return (toResponse sitemapXML) + return (toResponse sitemapIndex) + + serveSitemap :: DynamicPath -> ServerPartE Response + serveSitemap dpath = + case lookup "filename" dpath of + Just filename + | [basename, "xml"] <- splitOn "." filename + , Just i <- readMaybe basename -> do + Sitemap{..} <- liftIO $ readAsyncCache sitemapCache + guard (i < length sitemaps) + cacheControlWithoutETag [Public, maxAgeDays 1] + return (toResponse (sitemaps !! i)) + _ -> mzero -- Generates a list of sitemap entries corresponding to hackage pages, then -- builds and returns an XML sitemap. - updateSitemapCache :: IO XMLResponse + updateSitemapCache :: IO Sitemap updateSitemapCache = do alltags <- queryGetTagList pkgIndex <- queryGetPackageIndex docIndex <- queryDocumentationIndex - let sitemap = generateSitemap serverBaseURI pageBuildDate + let sitemaps = generateSitemap serverBaseURI pageBuildDate (map fst alltags) pkgIndex docIndex - return (XMLResponse sitemap) + uriScheme i = "/sitemap/" <> show i <> ".xml" + sitemapIndex = renderSitemapIndex serverBaseURI (map uriScheme [0..(length sitemaps - 1)]) + return $ Sitemap (XMLResponse sitemapIndex) (map XMLResponse sitemaps) pageBuildDate :: T.Text pageBuildDate = T.pack (showGregorian (utctDay initTime)) @@ -132,9 +166,9 @@ generateSitemap :: URI -> [Tag] -> PackageIndex.PackageIndex PkgInfo -> Map.Map PackageId a - -> ByteString + -> [ByteString] generateSitemap serverBaseURI pageBuildDate alltags pkgIndex docIndex = - renderSitemap serverBaseURI allEntries + renderSitemap serverBaseURI <$> chunksOf 50000 allEntries where -- Combine and build sitemap allEntries = miscEntries diff --git a/src/Distribution/Server/Features/Sitemap/Functions.hs b/src/Distribution/Server/Features/Sitemap/Functions.hs index c13208c15..7eae7f0ec 100644 --- a/src/Distribution/Server/Features/Sitemap/Functions.hs +++ b/src/Distribution/Server/Features/Sitemap/Functions.hs @@ -23,6 +23,7 @@ module Distribution.Server.Features.Sitemap.Functions ( SitemapEntry , ChangeFreq(..) + , renderSitemapIndex , renderSitemap , urlsToSitemapEntries , pathsAndDatesToSitemapEntries @@ -47,6 +48,22 @@ data SitemapEntry = SitemapEntry { data ChangeFreq = Monthly | Weekly | Daily +-- | Generate a sitemap index file from each sitemap uri. +renderSitemapIndex :: URI -> [String] -> ByteString +renderSitemapIndex serverBaseURI sitemaps = + xrender $ + doc defaultDocInfo $ + xelem "sitemapindex" $ + xattr "xmlns" "http://www.sitemaps.org/schemas/sitemap/0.9" + <#> map renderLink sitemaps + where + serverBaseURI' = T.pack (show serverBaseURI) + renderLink :: String -> Xml Elem + renderLink uri = xelem "sitemap" $ + xelems [ + xelem "loc" (xtext (serverBaseURI' <> T.pack (uri))) + ] + -- | Primary function - generates the XML file from a list of Nodes. renderSitemap :: URI -> [SitemapEntry] -> ByteString renderSitemap serverBaseURI entries = From b9330e07d3f1d38a23da367310afbcea7a0c6d5e Mon Sep 17 00:00:00 2001 From: Alias Qli <2576814881@qq.com> Date: Sat, 16 Jul 2022 16:28:36 +0800 Subject: [PATCH 2/2] Add sitemap link for subdirectories --- src/Distribution/Server/Features.hs | 1 + src/Distribution/Server/Features/Sitemap.hs | 68 +++++++++++++++------ 2 files changed, 52 insertions(+), 17 deletions(-) diff --git a/src/Distribution/Server/Features.hs b/src/Distribution/Server/Features.hs index f8a8e362e..54c584c50 100644 --- a/src/Distribution/Server/Features.hs +++ b/src/Distribution/Server/Features.hs @@ -337,6 +337,7 @@ initHackageFeatures env@ServerEnv{serverVerbosity = verbosity} = do coreFeature documentationCoreFeature tagsFeature + tarIndexCacheFeature packageFeedFeature <- mkPackageFeedFeature coreFeature diff --git a/src/Distribution/Server/Features/Sitemap.hs b/src/Distribution/Server/Features/Sitemap.hs index 429adc67b..9227d70e7 100644 --- a/src/Distribution/Server/Features/Sitemap.hs +++ b/src/Distribution/Server/Features/Sitemap.hs @@ -1,4 +1,5 @@ {-# LANGUAGE RecordWildCards, NamedFieldPuns, RecursiveDo #-} +{-# LANGUAGE TupleSections #-} module Distribution.Server.Features.Sitemap ( SitemapFeature(..) @@ -28,6 +29,10 @@ import Network.URI import Control.DeepSeq import Text.Read import Data.List.Split +import Distribution.Server.Framework.BlobStorage +import Distribution.Server.Features.TarIndexCache +import qualified Data.TarIndex as Tar +import System.FilePath (takeExtension) data Sitemap = Sitemap @@ -52,6 +57,7 @@ initSitemapFeature :: ServerEnv -> IO ( CoreFeature -> DocumentationFeature -> TagsFeature + -> TarIndexCacheFeature -> IO SitemapFeature) initSitemapFeature env@ServerEnv{ serverCacheDelay, @@ -60,10 +66,11 @@ initSitemapFeature env@ServerEnv{ serverCacheDelay, return $ \coref@CoreFeature{..} docsCore@DocumentationFeature{..} - tagsf@TagsFeature{..} -> do + tagsf@TagsFeature{..} + tarf@TarIndexCacheFeature{..} -> do rec let (feature, updateSitemapCache) = - sitemapFeature env coref docsCore tagsf + sitemapFeature env coref docsCore tagsf tarf initTime sitemapCache sitemapCache <- newAsyncCacheNF updateSitemapCache @@ -80,6 +87,7 @@ sitemapFeature :: ServerEnv -> CoreFeature -> DocumentationFeature -> TagsFeature + -> TarIndexCacheFeature -> UTCTime -> AsyncCache Sitemap -> (SitemapFeature, IO Sitemap) @@ -87,6 +95,7 @@ sitemapFeature ServerEnv{..} CoreFeature{..} DocumentationFeature{..} TagsFeature{..} + TarIndexCacheFeature{cachedTarIndex} initTime sitemapCache = (SitemapFeature{..}, updateSitemapCache) @@ -151,10 +160,10 @@ sitemapFeature ServerEnv{..} pkgIndex <- queryGetPackageIndex docIndex <- queryDocumentationIndex - let sitemaps = generateSitemap serverBaseURI pageBuildDate + sitemaps <- generateSitemap serverBaseURI pageBuildDate (map fst alltags) - pkgIndex docIndex - uriScheme i = "/sitemap/" <> show i <> ".xml" + pkgIndex docIndex cachedTarIndex + let uriScheme i = "/sitemap/" <> show i <> ".xml" sitemapIndex = renderSitemapIndex serverBaseURI (map uriScheme [0..(length sitemaps - 1)]) return $ Sitemap (XMLResponse sitemapIndex) (map XMLResponse sitemaps) @@ -165,19 +174,21 @@ generateSitemap :: URI -> T.Text -> [Tag] -> PackageIndex.PackageIndex PkgInfo - -> Map.Map PackageId a - -> [ByteString] -generateSitemap serverBaseURI pageBuildDate alltags pkgIndex docIndex = - renderSitemap serverBaseURI <$> chunksOf 50000 allEntries + -> Map.Map PackageId BlobId + -> (BlobId -> IO Tar.TarIndex) + -> IO [ByteString] +generateSitemap serverBaseURI pageBuildDate alltags pkgIndex docIndex cachedTarIndex = do + versionedDocSubEntries <- versionedDocSubEntriesIO + let -- Combine and build sitemap + allEntries = miscEntries + ++ tagEntries + ++ nameEntries + ++ nameVersEntries + ++ baseDocEntries + ++ versionedDocEntries + ++ versionedDocSubEntries + pure $ renderSitemap serverBaseURI <$> chunksOf 50000 allEntries where - -- Combine and build sitemap - allEntries = miscEntries - ++ tagEntries - ++ nameEntries - ++ nameVersEntries - ++ baseDocEntries - ++ versionedDocEntries - -- Misc. pages -- e.g. ["http://myhackage.com/index", ...] miscEntries = urlsToSitemapEntries miscPages pageBuildDate Weekly 0.75 @@ -258,3 +269,26 @@ generateSitemap serverBaseURI pageBuildDate alltags pkgIndex docIndex = , Map.member (packageId pkg) docIndex ] pageBuildDate Monthly 0.25 + + -- Versioned doc pages in subdirectories + -- versionedSubDocURIs :: [path :: String] + -- e.g. ["http://myhackage.com/packages/mypackage-1.0.2/docs/Lib.html", ...] + versionedDocSubEntriesIO = do + let pkgs = [ (pkg , blob) + | pkg <- concat pkgss + , Just blob <- [Map.lookup (packageId pkg) docIndex] + ] + pkgIndices <- traverse (\(pkg, blob) -> (pkg,) <$> cachedTarIndex blob) pkgs + pure $ urlsToSitemapEntries + [ prefixPkgURI ++ display (packageId pkg) ++ "/docs" ++ fp + | (pkg, tarIndex) <- pkgIndices + , Just tar <- [Tar.lookup tarIndex ""] + , fp <- entryToPaths "/" tar + , takeExtension fp == ".html" + ] + pageBuildDate Monthly 0.25 + + entryToPaths :: FilePath -> Tar.TarIndexEntry -> [FilePath] + entryToPaths _ (Tar.TarFileEntry _) = [] + entryToPaths base (Tar.TarDir content) = map ((base ) . fst) content ++ + [ file | (folder, entry) <- content, file <- entryToPaths (base folder) entry ]