diff --git a/example/config.yml b/example/config.yml
index 9a65a5b..f7b7bf2 100644
--- a/example/config.yml
+++ b/example/config.yml
@@ -5,4 +5,5 @@ enabled_modules:
   - tags
   - markdown
   - templating
-  - test
\ No newline at end of file
+  - test
+  - sitemaps
\ No newline at end of file
diff --git a/external_module_test/__init__.py b/external_module_test/__init__.py
index 5a59c42..77257c7 100644
--- a/external_module_test/__init__.py
+++ b/external_module_test/__init__.py
@@ -2,7 +2,7 @@ from grimoiressg.modules import available_modules
 from grimoiressg.utils import logger
 
 
-def test(data, context):
+def test(data, context, config):
     logger.info("This is test module.")
 
 
diff --git a/grimoiressg/__main__.py b/grimoiressg/__main__.py
index b8b38cb..fc6828b 100644
--- a/grimoiressg/__main__.py
+++ b/grimoiressg/__main__.py
@@ -10,7 +10,7 @@ from grimoiressg.utils import logger
 def apply_modules(data, config, context):
     for module in config.get("enabled_modules", []):
         logger.info("Applying module %s...", module)
-        available_modules[module](data, context)
+        available_modules[module](data, context, config)
 
 
 def main():
diff --git a/grimoiressg/modules/__init__.py b/grimoiressg/modules/__init__.py
index 133e55f..c366a1c 100644
--- a/grimoiressg/modules/__init__.py
+++ b/grimoiressg/modules/__init__.py
@@ -1,11 +1,13 @@
 from grimoiressg.modules.markdown import compile_markdown
+from grimoiressg.modules.sitemaps import generate_sitemaps
 from grimoiressg.modules.tags import extract_tags
 from grimoiressg.modules.templating import render_templates
 
 available_modules = {
     "tags": extract_tags,
     "markdown": compile_markdown,
-    "templating": render_templates
+    "templating": render_templates,
+    "sitemaps": generate_sitemaps,
 }
 
 
diff --git a/grimoiressg/modules/markdown.py b/grimoiressg/modules/markdown.py
index 2202277..8bcecfe 100644
--- a/grimoiressg/modules/markdown.py
+++ b/grimoiressg/modules/markdown.py
@@ -3,7 +3,7 @@ import markdown
 from grimoiressg.utils import logger
 
 
-def compile_markdown(data, context):
+def compile_markdown(data, context, config):
     for entry in data:
         if "markdown" in entry:
             logger.debug("Compiling markdown for %s...", entry['relative_filename'])
diff --git a/grimoiressg/modules/sitemaps.py b/grimoiressg/modules/sitemaps.py
new file mode 100644
index 0000000..264e36f
--- /dev/null
+++ b/grimoiressg/modules/sitemaps.py
@@ -0,0 +1,110 @@
+import gzip
+import os
+from itertools import batched
+from xml.etree import ElementTree as ET
+
+from grimoiressg.utils import to_relative, logger
+
+INDEX_FILE_STRATEGY_NONE = "none"
+INDEX_FILE_STRATEGY_AUTO = "auto"
+
+
+def sitemaps_default_config():
+    """Return the default settings; the `sitemaps` section of the config overrides them."""
+    return {
+        "file_prefix": "sitemap",
+        "loc_prefix": "https://example.com/",
+        "index_file_strategy": INDEX_FILE_STRATEGY_AUTO,
+        "compression": False,
+    }
+
+
+def get_files_to_map(data, sitemap_config):
+    """Return the entries to list in the sitemap, grouped into batches.
+
+    Entries without a truthy `output` or with `skip_sitemap` set are left out.
+    """
+    content_for_sitemap = filter(
+        lambda item: item.get("output", False) and not item.get("skip_sitemap", False),
+        data
+    )
+
+    if sitemap_config["index_file_strategy"] == INDEX_FILE_STRATEGY_AUTO:
+        # maximum number of entries is 50 000, however there is also a 50 MiB size limit
+        # -> make 20 000 item batches - to be safe
+        # batched() yields nothing for empty input; fall back to one empty batch
+        # so that callers can always rely on batches[0] existing
+        return list(batched(content_for_sitemap, 20000)) or [()]
+    else:
+        return [content_for_sitemap]
+
+
+def get_sitemap_file_suffix(sitemap_config):
+    """Return the sitemap file extension: `.xml.gz` when compressed, `.xml` otherwise."""
+    if sitemap_config["compression"]:
+        return ".xml.gz"
+    else:
+        return ".xml"
+
+
+def save_sitemaps_file(xml_data, name, context, sitemap_config):
+    """Serialize `xml_data` and write it (gzipped if configured) into the output directory."""
+    # "UTF-8" is the canonical encoding label; with it the declaration must be requested explicitly
+    xml_str = ET.tostring(xml_data, encoding="UTF-8", xml_declaration=True)
+
+    filename = os.path.realpath(context["output_dir"] + "/" + name + get_sitemap_file_suffix(sitemap_config))
+    logger.debug("Writing sitemap %s", to_relative(filename))
+
+    open_function = gzip.open if sitemap_config["compression"] else open
+    with open_function(filename, "wb") as file:
+        file.write(xml_str)
+
+
+def generate_index_file(context, sitemap_config, number_of_batches):
+    """Write the sitemap index that references the numbered sitemap files (numbering starts at 1)."""
+    root = ET.Element("sitemapindex", attrib={
+        "xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance",
+        "xsi:schemaLocation": "http://www.sitemaps.org/schemas/sitemap/0.9 "
+                              "http://www.sitemaps.org/schemas/sitemap/0.9/siteindex.xsd",
+        "xmlns": "http://www.sitemaps.org/schemas/sitemap/0.9"
+    })
+
+    for i in range(1, number_of_batches + 1):
+        sitemap = ET.SubElement(root, "sitemap")
+        loc = ET.SubElement(sitemap, "loc")
+        loc.text = sitemap_config["loc_prefix"] + sitemap_config["file_prefix"] + str(i) + get_sitemap_file_suffix(
+            sitemap_config)
+
+    save_sitemaps_file(root, sitemap_config["file_prefix"], context, sitemap_config)
+
+
+def generate_sitemaps_file(batch, name, context, sitemap_config):
+    """Write one `urlset` sitemap file called `name` with a `url` element per entry in `batch`."""
+    root = ET.Element("urlset", attrib={
+        "xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance",
+        "xsi:schemaLocation": "http://www.sitemaps.org/schemas/sitemap/0.9 "
+                              "http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd",
+        "xmlns": "http://www.sitemaps.org/schemas/sitemap/0.9"
+    })
+
+    for entry in batch:
+        url = ET.SubElement(root, "url")
+        loc = ET.SubElement(url, "loc")
+        loc.text = sitemap_config["loc_prefix"] + entry["output"]
+
+    save_sitemaps_file(root, name, context, sitemap_config)
+
+
+def generate_sitemaps(data, context, config):
+    """Module entry point: write a single sitemap, or an index plus numbered sitemaps for several batches."""
+    sitemaps_config = sitemaps_default_config()
+    sitemaps_config.update(config.get('sitemaps', {}))
+
+    batches = get_files_to_map(data, sitemaps_config)
+    if len(batches) > 1:
+        logger.info("Entry limit exceeded; generating index file...")
+        generate_index_file(context, sitemaps_config, len(batches))
+        for i, batch in enumerate(batches):
+            generate_sitemaps_file(batch, sitemaps_config["file_prefix"] + str(i + 1), context, sitemaps_config)
+    else:
+        generate_sitemaps_file(batches[0], sitemaps_config["file_prefix"], context, sitemaps_config)
diff --git a/grimoiressg/modules/tags.py b/grimoiressg/modules/tags.py
index 1d63cf7..331cc22 100644
--- a/grimoiressg/modules/tags.py
+++ b/grimoiressg/modules/tags.py
@@ -1,7 +1,7 @@
 from grimoiressg.utils import logger
 
 
-def extract_tags(data, context):
+def extract_tags(data, context, config):
     tags = {}
 
     for entry in data:
diff --git a/grimoiressg/modules/templating.py b/grimoiressg/modules/templating.py
index ef2e307..4dec8ad 100644
--- a/grimoiressg/modules/templating.py
+++ b/grimoiressg/modules/templating.py
@@ -9,7 +9,7 @@ jinja_env = Environment(
 )
 
 
-def render_templates(data, context):
+def render_templates(data, context, config):
     files_written = 0
 
     for entry in data: