openzim · benoit74 · Dec 17, 2024 · Nov 22, 2024 · Dec 10, 2024 · Dec 12, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -11,6 +11,24 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - Renamed `filesystem.validate_zimfile_creatable` to `filesystem.file_creatable` to reflect general applicability to check file creation beyond ZIM files #200
 - Remove any "ZIM" reference in exceptions while working with files #200
+- Significantly enhance the safety of metadata manipulation (#205)
+  - add types for all metadata, one type per metadata name plus some generic ones for non-standard metadata
+    - all types are responsible to validate metadata value at initialization time
+    - validation checks for adherence to the ZIM specification and conventions are automated
+    - cleanup of unwanted control characters and stripping white characters are **automated in all text metadata**
+    - whenever possible, try to **automatically clean a "reasonably" bad metadata** (e.g. automaticall accept and remove duplicate tags - harmless - but not duplicate language codes - codes are supposed to be ordered, so it is a weird situation) ; this is an alignment of paradigm, because for some metadata the lib was permissive, while for other it was quite restrictive ; this PR tries to align this and **make the lib as permissive as possible**, avoiding to fail a scraper for something which could be automatically fixed
+    - it is now possible to disable ZIM conventions checks with `zim.metadata.check_metadata_conventions`
+  - simplify `zim.creator.Creator.config_metadata` by using these types and been more strict:
+    - add new `StandardMetadata` class for standard metadata, including list of mandatory one
+    - by default, all non-standard metadata must start with `X-` prefix
+      - this not yet an openZIM convention / specification, so it is possible to disable this check with `fail_on_missing_prefix` argument
+  - simplify `add_metadata`, use same metadata types
+  - simplify `zim.creator.Creator.start` with new types, and drop all metadata from memory after being passed to the libzim
+  - drop `zim.creator.convert_and_check_metadata` (not usefull anymore, simply use proper metadata type)
+  - move `MANDATORY_ZIM_METADATA_KEYS` and `DEFAULT_DEV_ZIM_METADATA` from `constants` to `zim.metadata` to avoid circular dependencies
+  - new `inputs.unique_values` utility function to compute the list of uniques values from a given list, but preserving initial list order
+  - in `__init__` of `zim.creator.Creator`, rename `disable_metadata_checks` to `check_metadata_conventions` for clarity and brevity
+    - beware that this manipulate the global `zim.metadata.check_metadata_conventions`, so if you have many creator running in parallel, they can't have different settings, last one initialized will "win"
 
 ### Added
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -26,6 +26,7 @@ dependencies = [
   "regex>=2020.7.14",
   "pymupdf>=1.24.0,<2.0",
   "CairoSVG>=2.2.0,<3.0",
+  "beartype==0.19.0",
   # youtube-dl should be updated as frequently as possible
   "yt-dlp"
 ]
@@ -52,19 +53,19 @@ scripts = [
 ]
 lint = [
   "black==24.10.0",
-  "ruff==0.7.0",
+  "ruff==0.8.2",
 ]
 check = [
-  "pyright==1.1.385",
-  "pytest==8.3.3",
+  "pyright==1.1.390",
+  "pytest==8.3.4",
 ]
 test = [
-  "pytest==8.3.3",
+  "pytest==8.3.4",
   "pytest-mock==3.14.0",
-  "coverage==7.5.3",
+  "coverage==7.6.9",
 ]
 dev = [
-  "ipython==8.25.0",
+  "ipython==8.30.0",
   "pre-commit==4.0.1",
   "zimscraperlib[scripts]",
   "zimscraperlib[lint]",
@@ -252,6 +253,8 @@ ban-relative-imports = "all"
 "tests/**/*" = ["PLR2004", "S101", "TID252"]
 # _libkiwix mimics libkiwix C++ code, names obey C++ conventions
 "src/zimscraperlib/zim/_libkiwix.py" = ["N802", "N803", "N806"]
+# beartype must be first
+"src/zimscraperlib/zim/__init__.py" = ["E402"]
 
 [tool.pytest.ini_options]
 minversion = "7.3"

diff --git a/src/zimscraperlib/constants.py b/src/zimscraperlib/constants.py
@@ -1,7 +1,6 @@
 #!/usr/bin/env python3
 # vim: ai ts=4 sts=4 et sw=4 nu
 
-import base64
 import pathlib
 import re
 
@@ -22,34 +21,6 @@
 # list of mimetypes we consider articles using it should default to FRONT_ARTICLE
 FRONT_ARTICLE_MIMETYPES = ["text/html"]
 
-# list of mandatory meta tags of the zim file.
-MANDATORY_ZIM_METADATA_KEYS = [
-    "Name",
-    "Title",
-    "Creator",
-    "Publisher",
-    "Date",
-    "Description",
-    "Language",
-    "Illustration_48x48@1",
-]
-
-DEFAULT_DEV_ZIM_METADATA = {
-    "Name": "Test Name",
-    "Title": "Test Title",
-    "Creator": "Test Creator",
-    "Publisher": "Test Publisher",
-    "Date": "2023-01-01",
-    "Description": "Test Description",
-    "Language": "fra",
-    # blank 48x48 transparent PNG
-    "Illustration_48x48_at_1": base64.b64decode(
-        "iVBORw0KGgoAAAANSUhEUgAAADAAAAAwAQMAAABtzGvEAAAAGXRFWHRTb2Z0d2FyZQBB"
-        "ZG9iZSBJbWFnZVJlYWR5ccllPAAAAANQTFRFR3BMgvrS0gAAAAF0Uk5TAEDm2GYAAAAN"
-        "SURBVBjTY2AYBdQEAAFQAAGn4toWAAAAAElFTkSuQmCC"
-    ),
-}
-
 RECOMMENDED_MAX_TITLE_LENGTH = 30
 MAXIMUM_DESCRIPTION_METADATA_LENGTH = 80
 MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH = 4000

diff --git a/src/zimscraperlib/filesystem.py b/src/zimscraperlib/filesystem.py
@@ -9,8 +9,6 @@
 
 import os
 import pathlib
-from collections.abc import Callable
-from typing import Any
 
 import magic
 
@@ -44,15 +42,7 @@ def get_content_mimetype(content: bytes | str) -> str:
     return MIME_OVERRIDES.get(detected_mime, detected_mime)
 
 
-def delete_callback(
-    fpath: str | pathlib.Path,
-    callback: Callable | None = None,
-    *callback_args: Any,
-):
-    """helper deleting passed filepath, optionnaly calling an additional callback"""
+def delete_callback(fpath: str | pathlib.Path):
+    """helper deleting passed filepath"""
 
     os.unlink(fpath)
-
-    # call the callback if requested
-    if callback and callable(callback):
-        callback.__call__(*callback_args)
diff --git a/src/zimscraperlib/image/conversion.py b/src/zimscraperlib/image/conversion.py
@@ -37,7 +37,7 @@ def convert_image(
     if not fmt:
         raise ValueError("Impossible to guess destination image format")
     with pilopen(src) as image:
-        if image.mode == "RGBA" and fmt in ALPHA_NOT_SUPPORTED or colorspace:
+        if (image.mode == "RGBA" and fmt in ALPHA_NOT_SUPPORTED) or colorspace:
             image = image.convert(colorspace or "RGB")  # noqa: PLW2901
         save_image(image, dst, fmt, **params)
 

diff --git a/src/zimscraperlib/inputs.py b/src/zimscraperlib/inputs.py
@@ -136,3 +136,8 @@ def compute_tags(
     return {
         tag.strip() for tag in list(default_tags) + (user_tags or "").split(";") if tag
     }
+
+
+def unique_values(items: list) -> list:
+    """Return unique values in input list while preserving list order"""
+    return list(dict.fromkeys(items))
diff --git a/src/zimscraperlib/typing.py b/src/zimscraperlib/typing.py
@@ -0,0 +1,26 @@
+from __future__ import annotations
+
+from collections.abc import Callable
+from typing import Any, NamedTuple
+
+
+class Callback(NamedTuple):
+    func: Callable
+    args: tuple[Any, ...] | None = None
+    kwargs: dict[str, Any] | None = None
+
+    @property
+    def callable(self) -> bool:
+        return callable(self.func)
+
+    def get_args(self) -> tuple[Any, ...]:
+        return self.args or ()
+
+    def get_kwargs(self) -> dict[str, Any]:
+        return self.kwargs or {}
+
+    def call_with(self, *args, **kwargs):
+        self.func.__call__(*args, **kwargs)
+
+    def call(self):
+        self.call_with(*self.get_args(), **self.get_kwargs())
diff --git a/src/zimscraperlib/zim/__init__.py b/src/zimscraperlib/zim/__init__.py
@@ -9,8 +9,11 @@
     zim.items: item to add to creator
     zim.archive: read ZIM files, accessing or searching its content"""
 
+from beartype.claw import beartype_this_package
 from libzim.writer import Blob  # pyright: ignore
 
+beartype_this_package()
+
 from zimscraperlib.zim.archive import Archive
 from zimscraperlib.zim.creator import Creator
 from zimscraperlib.zim.filesystem import make_zim_file
@@ -24,14 +27,14 @@
 
 __all__ = [
     "Archive",
+    "Blob",
     "Creator",
-    "make_zim_file",
+    "FileLikeProvider",
+    "FileProvider",
     "Item",
     "StaticItem",
-    "URLItem",
-    "FileProvider",
     "StringProvider",
-    "FileLikeProvider",
+    "URLItem",
     "URLProvider",
-    "Blob",
+    "make_zim_file",
 ]
diff --git a/src/zimscraperlib/zim/_libkiwix.py b/src/zimscraperlib/zim/_libkiwix.py
@@ -15,12 +15,15 @@
 from __future__ import annotations
 
 import io
-from collections import namedtuple
+from typing import NamedTuple
 
-MimetypeAndCounter = namedtuple("MimetypeAndCounter", ["mimetype", "value"])
-CounterMap = dict[
-    type(MimetypeAndCounter.mimetype), type(MimetypeAndCounter.value)  # pyright: ignore
-]
+
+class MimetypeAndCounter(NamedTuple):
+    mimetype: str
+    value: int
+
+
+type CounterMap = dict[str, int]
 
 
 def getline(src: io.StringIO, delim: str | None = None) -> tuple[bool, str]:

diff --git a/src/zimscraperlib/zim/archive.py b/src/zimscraperlib/zim/archive.py
@@ -17,7 +17,7 @@
 import libzim.search  # Query, Searcher  # pyright: ignore
 import libzim.suggestion  # SuggestionSearcher  # pyright: ignore
 
-from zimscraperlib.zim._libkiwix import convertTags, parseMimetypeCounter
+from zimscraperlib.zim._libkiwix import CounterMap, convertTags, parseMimetypeCounter
 
 
 class Archive(libzim.reader.Archive):
@@ -101,7 +101,7 @@ def get_search_results_count(self, query: str) -> int:
         return search.getEstimatedMatches()
 
     @property
-    def counters(self) -> dict[str, int]:
+    def counters(self) -> CounterMap:
         try:
             return parseMimetypeCounter(self.get_text_metadata("Counter"))
         except RuntimeError:  # pragma: no cover (no ZIM avail to test itl)