Skip to content

Divide sitemap into parts #1103

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Dec 31, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/Distribution/Server/Features.hs
Original file line number Diff line number Diff line change
Expand Up @@ -337,6 +337,7 @@ initHackageFeatures env@ServerEnv{serverVerbosity = verbosity} = do
coreFeature
documentationCoreFeature
tagsFeature
tarIndexCacheFeature

packageFeedFeature <- mkPackageFeedFeature
coreFeature
Expand Down
126 changes: 97 additions & 29 deletions src/Distribution/Server/Features/Sitemap.hs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
{-# LANGUAGE RecordWildCards, NamedFieldPuns, RecursiveDo #-}
{-# LANGUAGE TupleSections #-}

module Distribution.Server.Features.Sitemap (
SitemapFeature(..)
Expand All @@ -25,7 +26,25 @@ import Data.ByteString.Lazy (ByteString)
import Data.Time.Clock (UTCTime(..), getCurrentTime)
import Data.Time.Calendar (showGregorian)
import Network.URI
import Control.DeepSeq
import Text.Read
import Data.List.Split
import Distribution.Server.Framework.BlobStorage
import Distribution.Server.Features.TarIndexCache
import qualified Data.TarIndex as Tar
import System.FilePath (takeExtension)

data Sitemap
= Sitemap
{ sitemapIndex :: XMLResponse
, sitemaps :: [XMLResponse]
}

instance NFData Sitemap where
rnf (Sitemap i s) = rnf i `seq` rnf s

instance MemSize Sitemap where
memSize (Sitemap i s) = memSize2 i s

data SitemapFeature = SitemapFeature {
sitemapFeatureInterface :: HackageFeature
Expand All @@ -38,6 +57,7 @@ initSitemapFeature :: ServerEnv
-> IO ( CoreFeature
-> DocumentationFeature
-> TagsFeature
-> TarIndexCacheFeature
-> IO SitemapFeature)

initSitemapFeature env@ServerEnv{ serverCacheDelay,
Expand All @@ -46,10 +66,11 @@ initSitemapFeature env@ServerEnv{ serverCacheDelay,

return $ \coref@CoreFeature{..}
docsCore@DocumentationFeature{..}
tagsf@TagsFeature{..} -> do
tagsf@TagsFeature{..}
tarf@TarIndexCacheFeature{..} -> do

rec let (feature, updateSitemapCache) =
sitemapFeature env coref docsCore tagsf
sitemapFeature env coref docsCore tagsf tarf
initTime sitemapCache

sitemapCache <- newAsyncCacheNF updateSitemapCache
Expand All @@ -66,63 +87,85 @@ sitemapFeature :: ServerEnv
-> CoreFeature
-> DocumentationFeature
-> TagsFeature
-> TarIndexCacheFeature
-> UTCTime
-> AsyncCache XMLResponse
-> (SitemapFeature, IO XMLResponse)
-> AsyncCache Sitemap
-> (SitemapFeature, IO Sitemap)
sitemapFeature ServerEnv{..}
CoreFeature{..}
DocumentationFeature{..}
TagsFeature{..}
TarIndexCacheFeature{cachedTarIndex}
initTime
sitemapCache
= (SitemapFeature{..}, updateSitemapCache)
where

sitemapFeatureInterface = (emptyHackageFeature "sitemap") {
featureResources = [ xmlSitemapResource ]
featureResources = [ xmlSitemapIndexResource, xmlSitemapResource ]
, featureState = []
, featureDesc = "Provides a sitemap.xml for search engines"
, featureDesc = "Provides sitemap for search engines"
, featureCaches =
[ CacheComponent {
cacheDesc = "sitemap.xml",
cacheDesc = "sitemap",
getCacheMemSize = memSize <$> readAsyncCache sitemapCache
}
]
, featurePostInit = do
syncAsyncCache sitemapCache
addCronJob serverCron CronJob {
cronJobName = "regenerate the cached sitemap.xml",
cronJobName = "regenerate the cached sitemap",
cronJobFrequency = DailyJobFrequency,
cronJobOneShot = False,
cronJobAction = prodAsyncCache sitemapCache "cron"
}
}

xmlSitemapIndexResource :: Resource
xmlSitemapIndexResource = (resourceAt "/sitemap_index.xml") {
resourceDesc = [(GET, "The dynamically generated sitemap index, in XML format")]
, resourceGet = [("xml", serveSitemapIndex)]
}

xmlSitemapResource :: Resource
xmlSitemapResource = (resourceAt "/sitemap.xml") {
xmlSitemapResource = (resourceAt "/sitemap/:filename") {
resourceDesc = [(GET, "The dynamically generated sitemap, in XML format")]
, resourceGet = [("xml", serveSitemap)]
}

serveSitemap :: DynamicPath -> ServerPartE Response
serveSitemap _ = do
sitemapXML <- liftIO $ readAsyncCache sitemapCache
serveSitemapIndex :: DynamicPath -> ServerPartE Response
serveSitemapIndex _ = do
Sitemap{..} <- liftIO $ readAsyncCache sitemapCache
cacheControlWithoutETag [Public, maxAgeDays 1]
return (toResponse sitemapXML)
return (toResponse sitemapIndex)

serveSitemap :: DynamicPath -> ServerPartE Response
serveSitemap dpath =
case lookup "filename" dpath of
Just filename
| [basename, "xml"] <- splitOn "." filename
, Just i <- readMaybe basename -> do
Sitemap{..} <- liftIO $ readAsyncCache sitemapCache
guard (i < length sitemaps)
cacheControlWithoutETag [Public, maxAgeDays 1]
return (toResponse (sitemaps !! i))
_ -> mzero

-- Generates a list of sitemap entries corresponding to hackage pages, then
-- builds and returns an XML sitemap.
updateSitemapCache :: IO XMLResponse
updateSitemapCache :: IO Sitemap
updateSitemapCache = do

alltags <- queryGetTagList
pkgIndex <- queryGetPackageIndex
docIndex <- queryDocumentationIndex

let sitemap = generateSitemap serverBaseURI pageBuildDate
sitemaps <- generateSitemap serverBaseURI pageBuildDate
(map fst alltags)
pkgIndex docIndex
return (XMLResponse sitemap)
pkgIndex docIndex cachedTarIndex
let uriScheme i = "/sitemap/" <> show i <> ".xml"
sitemapIndex = renderSitemapIndex serverBaseURI (map uriScheme [0..(length sitemaps - 1)])
return $ Sitemap (XMLResponse sitemapIndex) (map XMLResponse sitemaps)

pageBuildDate :: T.Text
pageBuildDate = T.pack (showGregorian (utctDay initTime))
Expand All @@ -131,19 +174,21 @@ generateSitemap :: URI
-> T.Text
-> [Tag]
-> PackageIndex.PackageIndex PkgInfo
-> Map.Map PackageId a
-> ByteString
generateSitemap serverBaseURI pageBuildDate alltags pkgIndex docIndex =
renderSitemap serverBaseURI allEntries
-> Map.Map PackageId BlobId
-> (BlobId -> IO Tar.TarIndex)
-> IO [ByteString]
generateSitemap serverBaseURI pageBuildDate alltags pkgIndex docIndex cachedTarIndex = do
versionedDocSubEntries <- versionedDocSubEntriesIO
let -- Combine and build sitemap
allEntries = miscEntries
++ tagEntries
++ nameEntries
++ nameVersEntries
++ baseDocEntries
++ versionedDocEntries
++ versionedDocSubEntries
pure $ renderSitemap serverBaseURI <$> chunksOf 50000 allEntries
where
-- Combine and build sitemap
allEntries = miscEntries
++ tagEntries
++ nameEntries
++ nameVersEntries
++ baseDocEntries
++ versionedDocEntries

-- Misc. pages
-- e.g. ["http://myhackage.com/index", ...]
miscEntries = urlsToSitemapEntries miscPages pageBuildDate Weekly 0.75
Expand Down Expand Up @@ -224,3 +269,26 @@ generateSitemap serverBaseURI pageBuildDate alltags pkgIndex docIndex =
, Map.member (packageId pkg) docIndex
]
pageBuildDate Monthly 0.25

-- Versioned doc pages in subdirectories
-- versionedSubDocURIs :: [path :: String]
-- e.g. ["http://myhackage.com/packages/mypackage-1.0.2/docs/Lib.html", ...]
versionedDocSubEntriesIO = do
let pkgs = [ (pkg , blob)
| pkg <- concat pkgss
, Just blob <- [Map.lookup (packageId pkg) docIndex]
]
pkgIndices <- traverse (\(pkg, blob) -> (pkg,) <$> cachedTarIndex blob) pkgs
pure $ urlsToSitemapEntries
[ prefixPkgURI ++ display (packageId pkg) ++ "/docs" ++ fp
| (pkg, tarIndex) <- pkgIndices
, Just tar <- [Tar.lookup tarIndex ""]
, fp <- entryToPaths "/" tar
, takeExtension fp == ".html"
]
pageBuildDate Monthly 0.25

entryToPaths :: FilePath -> Tar.TarIndexEntry -> [FilePath]
entryToPaths _ (Tar.TarFileEntry _) = []
entryToPaths base (Tar.TarDir content) = map ((base </>) . fst) content ++
[ file | (folder, entry) <- content, file <- entryToPaths (base </> folder) entry ]
17 changes: 17 additions & 0 deletions src/Distribution/Server/Features/Sitemap/Functions.hs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
module Distribution.Server.Features.Sitemap.Functions (
SitemapEntry
, ChangeFreq(..)
, renderSitemapIndex
, renderSitemap
, urlsToSitemapEntries
, pathsAndDatesToSitemapEntries
Expand All @@ -47,6 +48,22 @@ data SitemapEntry = SitemapEntry {

data ChangeFreq = Monthly | Weekly | Daily

-- | Generate a sitemap index file from each sitemap uri.
renderSitemapIndex :: URI -> [String] -> ByteString
renderSitemapIndex serverBaseURI sitemaps =
xrender $
doc defaultDocInfo $
xelem "sitemapindex" $
xattr "xmlns" "http://www.sitemaps.org/schemas/sitemap/0.9"
<#> map renderLink sitemaps
where
serverBaseURI' = T.pack (show serverBaseURI)
renderLink :: String -> Xml Elem
renderLink uri = xelem "sitemap" $
xelems [
xelem "loc" (xtext (serverBaseURI' <> T.pack (uri)))
]

-- | Primary function - generates the XML file from a list of Nodes.
renderSitemap :: URI -> [SitemapEntry] -> ByteString
renderSitemap serverBaseURI entries =
Expand Down