grimoire/grimoiressg/modules/sitemaps.py

97 lines
3.5 KiB
Python

import gzip
import os
from itertools import batched
from xml.etree import ElementTree as ET
from grimoiressg.utils import to_relative, logger
# Values for the "index_file_strategy" sitemap setting.
# "none": the code in this module never compares against this constant directly;
# any value other than "auto" makes get_files_to_map() return a single batch.
INDEX_FILE_STRATEGY_NONE = "none"
# "auto": get_files_to_map() splits the entries into batches of 20 000, and
# generate_sitemaps() writes an index file when more than one batch results.
INDEX_FILE_STRATEGY_AUTO = "auto"
def sitemaps_default_config():
    """Build a fresh dictionary holding the default sitemap settings.

    Returns:
        A new dict on every call with the keys ``"file_prefix"``,
        ``"loc_prefix"``, ``"index_file_strategy"`` and ``"compression"``.
    """
    defaults = {}
    defaults["file_prefix"] = "sitemap"
    defaults["loc_prefix"] = "https://example.com/"
    defaults["index_file_strategy"] = INDEX_FILE_STRATEGY_AUTO
    defaults["compression"] = False
    return defaults
def get_files_to_map(data, sitemap_config):
    """Select the entries that belong in a sitemap and group them into batches.

    An entry is kept when it has a truthy ``"output"`` value and does not have
    a truthy ``"skip_sitemap"`` value.

    Args:
        data: Iterable of dict-like entries (accessed through ``.get``).
        sitemap_config: Sitemap settings; only ``"index_file_strategy"`` is
            read here.

    Returns:
        A list containing at least one batch, each batch being a tuple of
        entries. With the "auto" strategy the entries are split into batches
        of at most 20 000; with any other strategy there is exactly one batch.
        When nothing qualifies, the result is a single empty batch.
    """
    entries = (
        item
        for item in data
        if item.get("output", False) and not item.get("skip_sitemap", False)
    )

    if sitemap_config["index_file_strategy"] == INDEX_FILE_STRATEGY_AUTO:
        # The maximum number of entries per sitemap is 50 000, but there is
        # also a 50 MiB size limit, so use 20 000 item batches to be safe.
        batches = list(batched(entries, 20000))
    else:
        # Materialize so both strategies return the same kind of batch.
        batches = [tuple(entries)]

    # batched() yields nothing for empty input, yet generate_sitemaps() reads
    # batches[0]; always hand back at least one (possibly empty) batch.
    return batches or [()]
def get_sitemap_file_suffix(sitemap_config):
    """Return the file extension used for generated sitemap files.

    Args:
        sitemap_config: Sitemap settings; only ``"compression"`` is read.

    Returns:
        ``".xml.gz"`` when ``"compression"`` is truthy, otherwise ``".xml"``.
    """
    return ".xml.gz" if sitemap_config["compression"] else ".xml"
def save_sitemaps_file(xml_data, name, context, sitemap_config):
    """Serialize an XML element tree and write it into the output directory.

    Args:
        xml_data: Root ``xml.etree.ElementTree.Element`` to serialize.
        name: File name without extension; the extension is chosen by
            ``get_sitemap_file_suffix``.
        context: Build context; ``"output_dir"`` is read as the target
            directory.
        sitemap_config: Sitemap settings; a truthy ``"compression"`` value
            makes the file gzip-compressed.
    """
    # Spell the encoding "UTF-8" and request the declaration explicitly: the
    # "utf8" alias is a Python codec name that would be copied verbatim into
    # the XML declaration, where it is not the standard encoding label.
    xml_bytes = ET.tostring(xml_data, encoding="UTF-8", xml_declaration=True)

    filename = os.path.realpath(
        os.path.join(
            context["output_dir"],
            name + get_sitemap_file_suffix(sitemap_config),
        )
    )
    logger.debug("Writing sitemap %s", to_relative(filename))

    # gzip.open and open share the same call shape for binary writing.
    open_function = gzip.open if sitemap_config["compression"] else open
    with open_function(filename, "wb") as file:
        file.write(xml_bytes)
def generate_index_file(context, sitemap_config, number_of_batches):
    """Write the sitemap index that points at every numbered sitemap file.

    One ``<sitemap><loc>`` entry is created per batch, numbered from 1 to
    ``number_of_batches``. The index itself is saved under the bare
    ``"file_prefix"`` name.

    Args:
        context: Build context, passed through to ``save_sitemaps_file``.
        sitemap_config: Sitemap settings; ``"loc_prefix"`` and
            ``"file_prefix"`` are read to build each location.
        number_of_batches: How many numbered sitemap files to reference.
    """
    index_attributes = {
        "xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance",
        "xsi:schemaLocation": "http://www.sitemaps.org/schemas/sitemap/0.9 "
                              "http://www.sitemaps.org/schemas/sitemap/0.9/siteindex.xsd",
        "xmlns": "http://www.sitemaps.org/schemas/sitemap/0.9",
    }
    index_root = ET.Element("sitemapindex", attrib=index_attributes)

    for offset in range(number_of_batches):
        location = ET.SubElement(ET.SubElement(index_root, "sitemap"), "loc")
        # Sitemap files are numbered starting at 1.
        location.text = "".join((
            sitemap_config["loc_prefix"],
            sitemap_config["file_prefix"],
            str(offset + 1),
            get_sitemap_file_suffix(sitemap_config),
        ))

    save_sitemaps_file(index_root, sitemap_config["file_prefix"], context, sitemap_config)
def generate_sitemaps_file(batch, name, context, sitemap_config):
    """Write one sitemap file listing the location of every entry in a batch.

    Args:
        batch: Iterable of entries; each entry's ``"output"`` value is
            appended to ``"loc_prefix"`` to form its ``<loc>`` text.
        name: File name without extension for the sitemap.
        context: Build context, passed through to ``save_sitemaps_file``.
        sitemap_config: Sitemap settings; ``"loc_prefix"`` is read here.
    """
    urlset_attributes = {
        "xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance",
        "xsi:schemaLocation": "http://www.sitemaps.org/schemas/sitemap/0.9 "
                              "http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd",
        "xmlns": "http://www.sitemaps.org/schemas/sitemap/0.9",
    }
    urlset = ET.Element("urlset", attrib=urlset_attributes)

    for page in batch:
        location = ET.SubElement(ET.SubElement(urlset, "url"), "loc")
        location.text = "".join((sitemap_config["loc_prefix"], page["output"]))

    save_sitemaps_file(urlset, name, context, sitemap_config)
def generate_sitemaps(data, context, config):
    """Generate all sitemap files (and an index, if needed) for the site.

    The defaults from ``sitemaps_default_config`` are overridden by whatever
    ``config`` provides under its ``'sitemaps'`` key. A single batch is
    written as one file named after ``"file_prefix"``; several batches are
    written as numbered files (starting at 1) plus an index file.

    Args:
        data: Entries to consider, passed to ``get_files_to_map``.
        context: Build context, passed through to the file writers.
        config: Site configuration; its optional ``'sitemaps'`` entry is
            merged over the defaults.
    """
    settings = sitemaps_default_config()
    settings.update(config.get('sitemaps', {}))

    batches = get_files_to_map(data, settings)
    batch_count = len(batches)

    if batch_count <= 1:
        # Everything fits in one file, so no index is needed.
        generate_sitemaps_file(batches[0], settings["file_prefix"], context, settings)
        return

    logger.info("Entry limit exceeded; generating index file...")
    generate_index_file(context, settings, batch_count)
    for number, batch in enumerate(batches, start=1):
        generate_sitemaps_file(batch, settings["file_prefix"] + str(number), context, settings)