aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorRalph Amissah <ralph.amissah@gmail.com>2026-04-13 15:33:07 -0400
committerRalph Amissah <ralph.amissah@gmail.com>2026-04-13 16:25:56 -0400
commitd0ac448e6425c9e4246cd529aeb11643dce8093f (patch)
treec12356fbc55cffd495cc37b81ca6fb125e3be195
parentpackage.nix cosmetic line-breaks for build command (diff)
spine may be run against a document-markup zip pod
- claude contributed src
  - Opens the zip with std.zip.ZipArchive (reads the whole file into memory)
  - Locates pod.manifest inside the archive to discover document paths and languages
  - Extracts markup files (.sst/.ssm/.ssi) as in-memory strings
  - Extracts images as in-memory byte arrays
  - Extracts conf/dr_document_make if present
  - Presents these to the existing pipeline as if they were read from the filesystem
  - Some security mitigations:
    - Zip Slip / Path Traversal: Reject entries containing `..` or starting with `/`; canonicalize resolved paths and verify they fall within extraction root
    - Zip Bomb: Check `ArchiveMember.size` before extracting; enforce per-file (50MB) and total size limits (500MB)
    - Entry Count: Limit number of entries (a pod should have at most ~100 files)
    - Path depth: limit (Maximum 10 path components).
    - Symlinks: Verify no symlinks in extracted content before processing (post-extraction recursive scan)
    - Filename Validation: Only allow expected characters; reject null bytes
    - Malformed Zips: Catch `ZipException` from `std.zip.ZipArchive` constructor
    - Cleanup on error
-rw-r--r--org/in_zip_pod.org283
-rw-r--r--org/spine.org180
-rw-r--r--src/sisudoc/io_in/read_zip_pod.d279
-rwxr-xr-xsrc/sisudoc/spine.d180
4 files changed, 918 insertions, 4 deletions
diff --git a/org/in_zip_pod.org b/org/in_zip_pod.org
new file mode 100644
index 0000000..bc5b069
--- /dev/null
+++ b/org/in_zip_pod.org
@@ -0,0 +1,283 @@
+-*- mode: org -*-
+#+TITLE: sisudoc spine (doc_reform) markup source zip pod
+#+DESCRIPTION: documents - structuring, publishing in multiple formats & search
+#+FILETAGS: :spine:sourcefile:read:
+#+AUTHOR: Ralph Amissah
+#+EMAIL: [[mailto:ralph.amissah@gmail.com][ralph.amissah@gmail.com]]
+#+COPYRIGHT: Copyright (C) 2015 (continuously updated, current 2026) Ralph Amissah
+#+LANGUAGE: en
+#+STARTUP: content hideblocks hidestars noindent entitiespretty
+#+PROPERTY: header-args :exports code
+#+PROPERTY: header-args+ :noweb yes
+#+PROPERTY: header-args+ :results silent
+#+PROPERTY: header-args+ :cache no
+#+PROPERTY: header-args+ :padline no
+#+PROPERTY: header-args+ :mkdirp yes
+#+OPTIONS: H:3 num:nil toc:t \n:t ::t |:t ^:nil -:t f:t *:t
+
+- [[./doc-reform.org][doc-reform.org]] [[./][org/]]
+
+* read zip
+
+#+HEADER: :tangle "../src/sisudoc/io_in/read_zip_pod.d"
+#+HEADER: :noweb yes
+#+BEGIN_SRC d
+<<doc_header_including_copyright_and_license>>
+/++
+ module read_zip_pod;<BR>
+ - extract pod zip archives to temp directory for processing<BR>
+ - validate zip entries for security (path traversal, size limits)
++/
+module sisudoc.io_in.read_zip_pod;
+@safe:
+template spineExtractZipPod() {
+ import std.algorithm : canFind;
+ import std.array : array;
+ import std.conv : to;
+ import std.file;
+ import std.path;
+ import std.regex;
+ import std.stdio;
+ import std.string : indexOf;
+
+ /+ security limits for zip extraction +/
+ enum size_t MAX_ENTRY_SIZE = 50 * 1024 * 1024; /+ 50 MB per entry +/
+ enum size_t MAX_TOTAL_SIZE = 500 * 1024 * 1024; /+ 500 MB total +/
+ enum size_t MAX_ENTRY_COUNT = 500; /+ max entries in archive +/
+ enum size_t MAX_PATH_DEPTH = 10; /+ max path components +/
+
+ /+ allowed entry name pattern: alphanumeric, dots, dashes, underscores, forward slashes +/
+ static auto rgx_safe_entry_name = ctRegex!(`^[a-zA-Z0-9._/ -]+$`);
+
+ struct ZipPodResult {
+ string tmp_dir; /+ temp directory containing extracted pod +/
+ string pod_dir; /+ path to pod directory within tmp_dir +/
+ bool ok; /+ extraction succeeded +/
+ string error_msg; /+ error description if !ok +/
+ }
+
+ /+ ↓ validate a single zip entry name for security +/
+ string validateEntryName(string name) {
+ /+ reject empty names +/
+ if (name.length == 0)
+ return "empty entry name";
+ /+ reject absolute paths +/
+ if (name[0] == '/')
+ return "absolute path in zip entry: " ~ name;
+ /+ reject path traversal +/
+ if (name.canFind(".."))
+ return "path traversal in zip entry: " ~ name;
+ /+ reject null bytes +/
+ if (name.indexOf('\0') >= 0)
+ return "null byte in zip entry name: " ~ name;
+ /+ reject backslashes (windows path separator tricks) +/
+ if (name.canFind("\\"))
+ return "backslash in zip entry: " ~ name;
+ /+ check path depth +/
+ size_t depth = 0;
+ foreach (c; name) {
+ if (c == '/') depth++;
+ }
+ if (depth > MAX_PATH_DEPTH)
+ return "path too deep in zip entry: " ~ name;
+ /+ check allowed characters +/
+ if (!(name.matchFirst(rgx_safe_entry_name)))
+ return "disallowed characters in zip entry: " ~ name;
+ return ""; /+ empty string means valid +/
+ }
+
+ /+ ↓ extract zip pod to temp directory, returns ZipPodResult +/
+ @trusted ZipPodResult extractZipPod(string zip_path) {
+ import std.zip;
+ ZipPodResult result;
+ result.ok = false;
+ /+ ↓ verify zip file exists +/
+ if (!exists(zip_path) || !zip_path.isFile) {
+ result.error_msg = "zip file not found: " ~ zip_path;
+ return result;
+ }
+ /+ ↓ derive pod name from zip filename +/
+ string zip_basename = zip_path.baseName.stripExtension;
+ /+ ↓ read and parse zip archive +/
+ ZipArchive zip;
+ try {
+ zip = new ZipArchive(read(zip_path));
+ } catch (ZipException ex) {
+ result.error_msg = "failed to read zip archive: " ~ zip_path ~ " - " ~ ex.msg;
+ return result;
+ } catch (Exception ex) {
+ result.error_msg = "error reading zip file: " ~ zip_path ~ " - " ~ ex.msg;
+ return result;
+ }
+ /+ ↓ validate entry count +/
+ if (zip.directory.length > MAX_ENTRY_COUNT) {
+ result.error_msg = "zip archive has too many entries ("
+ ~ zip.directory.length.to!string ~ " > " ~ MAX_ENTRY_COUNT.to!string ~ "): " ~ zip_path;
+ return result;
+ }
+ /+ ↓ validate all entries before extracting any +/
+ size_t total_size = 0;
+ foreach (name, member; zip.directory) {
+ /+ validate entry name +/
+ string name_err = validateEntryName(name);
+ if (name_err.length > 0) {
+ result.error_msg = name_err;
+ return result;
+ }
+ /+ check per-entry size +/
+ if (member.expandedSize > MAX_ENTRY_SIZE) {
+ result.error_msg = "zip entry too large ("
+ ~ member.expandedSize.to!string ~ " bytes): " ~ name;
+ return result;
+ }
+ /+ check total size +/
+ total_size += member.expandedSize;
+ if (total_size > MAX_TOTAL_SIZE) {
+ result.error_msg = "zip archive total size exceeds limit ("
+ ~ MAX_TOTAL_SIZE.to!string ~ " bytes): " ~ zip_path;
+ return result;
+ }
+ }
+ /+ ↓ create temp directory +/
+ string tmp_base = tempDir.buildPath("spine-zip-pod");
+ try {
+ if (!exists(tmp_base))
+ mkdirRecurse(tmp_base);
+ } catch (FileException ex) {
+ result.error_msg = "failed to create temp base directory: " ~ ex.msg;
+ return result;
+ }
+ /+ pod directory inside temp: tmp_base/pod_name/ +/
+ string pod_dir = tmp_base.buildPath(zip_basename);
+ try {
+ if (exists(pod_dir))
+ rmdirRecurse(pod_dir);
+ mkdirRecurse(pod_dir);
+ } catch (FileException ex) {
+ result.error_msg = "failed to create temp pod directory: " ~ ex.msg;
+ return result;
+ }
+ /+ ↓ extract entries +/
+ /+ zip internal structure uses paths like:
+ pod.manifest, conf/dr_document_make,
+ pod/media/text/en/filename.sst, image/filename.png
+ but the extracted pod directory needs to look like a normal pod:
+ pod.manifest, conf/dr_document_make,
+ media/text/en/filename.sst, image/filename.png
+ The "pod/" prefix in zip entries for text files maps to the pod root.
+ +/
+ /+ ↓ pre-compute canonical pod path for containment checks +/
+ auto canonical_pod = (pod_dir.asNormalizedPath).array.to!string ~ "/";
+ foreach (name, member; zip.directory) {
+ /+ skip directory entries +/
+ if (name.length > 0 && name[$-1] == '/')
+ continue;
+ /+ ↓ map zip internal path to filesystem path +/
+ /+ entries with "pod/" prefix: strip it so media/text/en/file.sst ends up at pod_dir/media/text/en/file.sst +/
+ string entry_path = name;
+ if (entry_path.length > 4 && entry_path[0..4] == "pod/") {
+ entry_path = entry_path[4..$];
+ }
+ string out_path = pod_dir.buildPath(entry_path);
+ /+ ↓ verify resolved path is within pod_dir (defense in depth) +/
+ auto canonical_out = (out_path.asNormalizedPath).array.to!string;
+ if (canonical_out.length < canonical_pod.length
+ || canonical_out[0..canonical_pod.length] != canonical_pod)
+ {
+ result.error_msg = "zip entry escapes extraction directory: " ~ name;
+ try { rmdirRecurse(pod_dir); } catch (FileException) {}
+ return result;
+ }
+ /+ ↓ create parent directories +/
+ string parent = out_path.dirName;
+ try {
+ if (!exists(parent))
+ mkdirRecurse(parent);
+ } catch (FileException ex) {
+ result.error_msg = "failed to create directory for: " ~ name ~ " - " ~ ex.msg;
+ try { rmdirRecurse(pod_dir); } catch (FileException) {}
+ return result;
+ }
+ /+ ↓ decompress and write file +/
+ try {
+ auto data = zip.expand(member);
+ std.file.write(out_path, data);
+ } catch (Exception ex) {
+ result.error_msg = "failed to extract: " ~ name ~ " - " ~ ex.msg;
+ try { rmdirRecurse(pod_dir); } catch (FileException) {}
+ return result;
+ }
+ }
+ /+ ↓ verify no symlinks were created (defense in depth) +/
+ string symlink_err = checkForSymlinks(pod_dir);
+ if (symlink_err.length > 0) {
+ result.error_msg = symlink_err;
+ try { rmdirRecurse(pod_dir); } catch (FileException) {}
+ return result;
+ }
+ /+ ↓ verify pod.manifest exists in extracted content +/
+ if (!exists(pod_dir.buildPath("pod.manifest"))) {
+ result.error_msg = "zip archive does not contain pod.manifest: " ~ zip_path;
+ try { rmdirRecurse(pod_dir); } catch (FileException) {}
+ return result;
+ }
+ result.tmp_dir = tmp_base;
+ result.pod_dir = pod_dir;
+ result.ok = true;
+ return result;
+ }
+
+ /+ ↓ recursively check for symlinks in extracted directory +/
+ @trusted string checkForSymlinks(string dir_path) {
+ try {
+ foreach (entry; dirEntries(dir_path, SpanMode.depth)) {
+ if (entry.isSymlink) {
+ return "symlink found in zip extraction: " ~ entry.name;
+ }
+ }
+ } catch (FileException ex) {
+ return "error checking for symlinks: " ~ ex.msg;
+ }
+ return "";
+ }
+
+ /+ ↓ clean up extracted temp directory +/
+ void cleanupZipPod(ref ZipPodResult zpr) {
+ if (zpr.pod_dir.length > 0 && exists(zpr.pod_dir)) {
+ try {
+ rmdirRecurse(zpr.pod_dir);
+ } catch (FileException ex) {
+ stderr.writeln("WARNING: failed to clean up temp zip extraction: ", zpr.pod_dir);
+ }
+ }
+ zpr.ok = false;
+ }
+}
+#+END_SRC
+
+* org includes
+** project version
+
+#+NAME: spine_version
+#+HEADER: :noweb yes
+#+BEGIN_SRC emacs-lisp
+<<./sisudoc_spine_version_info_and_doc_header_including_copyright_and_license.org:spine_project_version()>>
+#+END_SRC
+
+** year
+
+#+NAME: year
+#+HEADER: :noweb yes
+#+BEGIN_SRC emacs-lisp
+<<./sisudoc_spine_version_info_and_doc_header_including_copyright_and_license.org:year()>>
+#+END_SRC
+
+** document header including copyright & license
+
+#+NAME: doc_header_including_copyright_and_license
+#+HEADER: :noweb yes
+#+BEGIN_SRC emacs-lisp
+<<./sisudoc_spine_version_info_and_doc_header_including_copyright_and_license.org:spine_doc_header_including_copyright_and_license()>>
+#+END_SRC
+
+* __END__
diff --git a/org/spine.org b/org/spine.org
index c46c8b1..c218df0 100644
--- a/org/spine.org
+++ b/org/spine.org
@@ -115,6 +115,10 @@ string program_name = "spine";
}
}
} // else { writeln("NO METADATA CURATED"); }
+ /+ ↓ clean up any extracted zip pod temp directories +/
+ foreach (ref _zpr; _zip_pod_extractions) {
+ cleanupZipPod(_zpr);
+ }
}
#+END_SRC
@@ -148,6 +152,7 @@ import sisudoc.meta.rgx_files;
import sisudoc.io_in.paths_source;
import sisudoc.io_in.read_config_files;
import sisudoc.io_in.read_source_files;
+import sisudoc.io_in.read_zip_pod;
import sisudoc.io_out.hub;
#+END_SRC
@@ -1057,6 +1062,9 @@ auto _env = [
auto _manifested = PathMatters!()(_opt_action, _env, "");
auto _manifests = [ _manifested ];
auto _conf_file_details = configFilePaths!()(_manifested, _env, _opt_action.config_path_set);
+/+ ↓ track extracted zip pod temp directories for cleanup +/
+mixin spineExtractZipPod;
+ZipPodResult[] _zip_pod_extractions;
ConfComposite _siteConfig;
if (
_opt_action.require_processing_files
@@ -1064,7 +1072,16 @@ if (
) {
foreach(arg; args[1..$]) {
if (!(arg.match(rgx.flag_action))) { /+ cli markup source path +/ // get first input markup source file names for processing
- _manifested = PathMatters!()(_opt_action, _env, arg);
+ string _config_arg = arg;
+ /+ ↓ if first non-flag arg is a zip, extract for config discovery +/
+ if (arg.match(rgx_files.src_pth_zip)) {
+ auto _zpr = extractZipPod(arg);
+ if (_zpr.ok) {
+ _zip_pod_extractions ~= _zpr;
+ _config_arg = _zpr.pod_dir;
+ }
+ }
+ _manifested = PathMatters!()(_opt_action, _env, _config_arg);
{ /+ local site config +/
_conf_file_details = configFilePaths!()(_manifested, _env, _opt_action.config_path_set);
auto _config_local_site_struct = readConfigSite!()(_conf_file_details, _opt_action, _cfg);
@@ -1260,7 +1277,166 @@ foreach(arg; args[1..$]) {
_manifests ~= _manifested;
}
} else if (arg.match(rgx_files.src_pth_zip)) {
- // fns_src ~= arg; // gather input markup source file names for processing
+ /+ ↓ zip pod archive: extract to temp dir, process as pod +/
+ /+ check if this zip was already extracted during config discovery +/
+ string _zip_pod_dir;
+ foreach (ref _zpr; _zip_pod_extractions) {
+ if (_zpr.ok && _zpr.pod_dir.length > 0
+ && _zpr.pod_dir.baseName == arg.baseName.stripExtension)
+ {
+ _zip_pod_dir = _zpr.pod_dir;
+ break;
+ }
+ }
+ if (_zip_pod_dir.length == 0) {
+ auto _zpr = extractZipPod(arg);
+ if (!_zpr.ok) {
+ writeln("ERROR >> Processing Skipped! Zip extraction failed: ", arg, " - ", _zpr.error_msg);
+ } else {
+ _zip_pod_extractions ~= _zpr;
+ _zip_pod_dir = _zpr.pod_dir;
+ }
+ }
+ if (_zip_pod_dir.length > 0) {
+ /+ process extracted pod directory same as regular pod +/
+ auto _zip_manifest = PodManifest!()(_opt_action, _zip_pod_dir);
+ if (_zip_manifest.pod_manifest_file_with_path
+ && _opt_action.abstraction
+ ) {
+ string pod_manifest_root_content_paths_to_markup_location_raw_;
+ string markup_contents_location_;
+ string sisudoc_txt_ = _zip_manifest.pod_manifest_file_with_path;
+ enforce(
+ exists(sisudoc_txt_)!=0,
+ "file not found: <<" ~
+ sisudoc_txt_ ~ ">>"
+ );
+ if (exists(sisudoc_txt_)) {
+ try {
+ import dyaml;
+ Node pod_manifest_yaml;
+ try {
+ pod_manifest_yaml = Loader.fromFile(sisudoc_txt_).load();
+ } catch (ErrnoException ex) {
+ } catch (FileException ex) {
+ writeln("ERROR failed to read config file");
+ } catch (Throwable) {
+ writeln("ERROR failed to read config file content, not parsed as yaml");
+ }
+ if ("doc" in pod_manifest_yaml) {
+ if (pod_manifest_yaml["doc"].type.mapping
+ && pod_manifest_yaml["doc"].tag.match(rgx_y.yaml_tag_is_map)
+ ) {
+ if ("path" in pod_manifest_yaml["doc"]) {
+ if (pod_manifest_yaml["doc"]["path"].tag.match(rgx_y.yaml_tag_is_seq)) {
+ foreach (string _path; pod_manifest_yaml["doc"]["path"]) {
+ markup_contents_location_ ~= _path ~ "\n";
+ pod_manifest_root_content_paths_to_markup_location_raw_ ~=
+ _path ~ "\n";
+ }
+ } else if (
+ pod_manifest_yaml["doc"]["path"].type.string
+ && pod_manifest_yaml["doc"]["path"].tag.match(rgx_y.yaml_tag_is_str)
+ ) {
+ markup_contents_location_ = pod_manifest_yaml["doc"]["path"].get!string;
+ pod_manifest_root_content_paths_to_markup_location_raw_ =
+ pod_manifest_yaml["doc"]["path"].get!string;
+ }
+ }
+ if ("filename" in pod_manifest_yaml["doc"]) {
+ if (pod_manifest_yaml["doc"]["filename"].tag.match(rgx_y.yaml_tag_is_seq)) {
+ foreach (string _filename; pod_manifest_yaml["doc"]["filename"]) {
+ if ("language" in pod_manifest_yaml["doc"]) {
+ if (pod_manifest_yaml["doc"]["language"].tag.match(rgx_y.yaml_tag_is_seq)) {
+ foreach (string _lang; pod_manifest_yaml["doc"]["language"]) {
+ markup_contents_location_ ~=
+ "media/text/"
+ ~ _lang ~ "/"
+ ~ _filename ~ "\n";
+ }
+ } else if (pod_manifest_yaml["doc"]["language"].tag.match(rgx_y.yaml_tag_is_str)
+ ) {
+ markup_contents_location_ =
+ "media/text/"
+ ~ pod_manifest_yaml["doc"]["language"].get!string
+ ~ "/" ~ _filename ~ "\n";
+ } else {
+ string _lang_default = "en";
+ markup_contents_location_ ~=
+ "media/text/"
+ ~ _lang_default ~ "/"
+ ~ pod_manifest_yaml["doc"]["filename"].get!string ~ "\n";
+ }
+ } else {
+ string _lang_default = "en";
+ markup_contents_location_ ~=
+ "media/text/"
+ ~ _lang_default ~ "/"
+ ~ pod_manifest_yaml["doc"]["filename"].get!string ~ "\n";
+ }
+ }
+ } else if (
+ pod_manifest_yaml["doc"]["filename"].type.string
+ && pod_manifest_yaml["doc"]["filename"].tag.match(rgx_y.yaml_tag_is_str)
+ ) {
+ if ("language" in pod_manifest_yaml["doc"]) {
+ if (pod_manifest_yaml["doc"]["language"].tag.match(rgx_y.yaml_tag_is_seq)) {
+ foreach (string _lang; pod_manifest_yaml["doc"]["language"]) {
+ markup_contents_location_ ~=
+ "media/text/"
+ ~ _lang ~ "/"
+ ~ pod_manifest_yaml["doc"]["filename"].get!string ~ "\n";
+ }
+ } else if (pod_manifest_yaml["doc"]["language"].tag.match(rgx_y.yaml_tag_is_str)) {
+ markup_contents_location_ =
+ "media/text/"
+ ~ pod_manifest_yaml["doc"]["language"].get!string
+ ~ "/" ~ pod_manifest_yaml["doc"]["filename"].get!string ~ "\n";
+ } else {
+ string _lang_default = "en";
+ markup_contents_location_ ~=
+ "media/text/"
+ ~ _lang_default ~ "/"
+ ~ pod_manifest_yaml["doc"]["filename"].get!string ~ "\n";
+ }
+ } else {
+ string _lang_default = "en";
+ markup_contents_location_ ~=
+ "media/text/"
+ ~ _lang_default ~ "/"
+ ~ pod_manifest_yaml["doc"]["filename"].get!string ~ "\n";
+ }
+ }
+ }
+ }
+ }
+ } catch (ErrnoException ex) {
+ } catch (FileException ex) {
+ // Handle errors
+ }
+ } else {
+ writeln("manifest not found: ", sisudoc_txt_);
+ }
+ auto markup_contents_locations_arr
+ = (cast(char[]) markup_contents_location_).split;
+ auto tmp_dir_ = (sisudoc_txt_).dirName.array;
+ foreach (markup_contents_location; markup_contents_locations_arr) {
+ assert(markup_contents_location.match(rgx_files.src_pth_sst_or_ssm),
+ "not a recognised file: <<" ~
+ markup_contents_location ~ ">>"
+ );
+ auto markup_contents_location_pth_ = (markup_contents_location).to!string;
+ Regex!(char) lang_rgx_ = regex(r"/(" ~ _opt_action.languages_set.join("|") ~ ")/");
+ if (_opt_action.languages_set[0] == "all"
+ || (markup_contents_location_pth_).match(lang_rgx_)
+ ) {
+ auto _fns = (((tmp_dir_).chainPath(markup_contents_location_pth_)).array).to!string;
+ _manifested = PathMatters!()(_opt_action, _env, _zip_pod_dir, _fns, markup_contents_locations_arr);
+ _manifests ~= _manifested;
+ }
+ }
+ }
+ }
} else { // anything remaining, unused
arg_unrecognized ~= " " ~ arg;
}
diff --git a/src/sisudoc/io_in/read_zip_pod.d b/src/sisudoc/io_in/read_zip_pod.d
new file mode 100644
index 0000000..38480cd
--- /dev/null
+++ b/src/sisudoc/io_in/read_zip_pod.d
@@ -0,0 +1,279 @@
+/+
+- Name: SisuDoc Spine, Doc Reform [a part of]
+ - Description: documents, structuring, processing, publishing, search
+ - static content generator
+
+ - Author: Ralph Amissah
+ [ralph.amissah@gmail.com]
+
+ - Copyright: (C) 2015 (continuously updated, current 2026) Ralph Amissah, All Rights Reserved.
+
+ - License: AGPL 3 or later:
+
+ Spine (SiSU), a framework for document structuring, publishing and
+ search
+
+ Copyright (C) Ralph Amissah
+
+ This program is free software: you can redistribute it and/or modify it
+   under the terms of the GNU AFFERO General Public License as published by the
+ Free Software Foundation, either version 3 of the License, or (at your
+ option) any later version.
+
+ This program is distributed in the hope that it will be useful, but WITHOUT
+ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ You should have received a copy of the GNU General Public License along with
+ this program. If not, see [https://www.gnu.org/licenses/].
+
+ If you have Internet connection, the latest version of the AGPL should be
+ available at these locations:
+ [https://www.fsf.org/licensing/licenses/agpl.html]
+ [https://www.gnu.org/licenses/agpl.html]
+
+ - Spine (by Doc Reform, related to SiSU) uses standard:
+ - docReform markup syntax
+ - standard SiSU markup syntax with modified headers and minor modifications
+ - docReform object numbering
+ - standard SiSU object citation numbering & system
+
+ - Homepages:
+ [https://www.sisudoc.org]
+ [https://www.doc-reform.org]
+
+ - Git
+ [https://git.sisudoc.org/]
+
++/
+/++
+ module read_zip_pod;<BR>
+ - extract pod zip archives to temp directory for processing<BR>
+ - validate zip entries for security (path traversal, size limits)
++/
+module sisudoc.io_in.read_zip_pod;
+@safe:
+template spineExtractZipPod() {
+ import std.algorithm : canFind;
+ import std.array : array;
+ import std.conv : to;
+ import std.file;
+ import std.path;
+ import std.regex;
+ import std.stdio;
+ import std.string : indexOf;
+
+ /+ security limits for zip extraction +/
+ enum size_t MAX_ENTRY_SIZE = 50 * 1024 * 1024; /+ 50 MB per entry +/
+ enum size_t MAX_TOTAL_SIZE = 500 * 1024 * 1024; /+ 500 MB total +/
+ enum size_t MAX_ENTRY_COUNT = 500; /+ max entries in archive +/
+ enum size_t MAX_PATH_DEPTH = 10; /+ max path components +/
+
+ /+ allowed entry name pattern: alphanumeric, dots, dashes, underscores, forward slashes +/
+ static auto rgx_safe_entry_name = ctRegex!(`^[a-zA-Z0-9._/ -]+$`);
+
+ struct ZipPodResult {
+ string tmp_dir; /+ temp directory containing extracted pod +/
+ string pod_dir; /+ path to pod directory within tmp_dir +/
+ bool ok; /+ extraction succeeded +/
+ string error_msg; /+ error description if !ok +/
+ }
+
+ /+ ↓ validate a single zip entry name for security +/
+ string validateEntryName(string name) {
+ /+ reject empty names +/
+ if (name.length == 0)
+ return "empty entry name";
+ /+ reject absolute paths +/
+ if (name[0] == '/')
+ return "absolute path in zip entry: " ~ name;
+ /+ reject path traversal +/
+ if (name.canFind(".."))
+ return "path traversal in zip entry: " ~ name;
+ /+ reject null bytes +/
+ if (name.indexOf('\0') >= 0)
+ return "null byte in zip entry name: " ~ name;
+ /+ reject backslashes (windows path separator tricks) +/
+ if (name.canFind("\\"))
+ return "backslash in zip entry: " ~ name;
+ /+ check path depth +/
+ size_t depth = 0;
+ foreach (c; name) {
+ if (c == '/') depth++;
+ }
+ if (depth > MAX_PATH_DEPTH)
+ return "path too deep in zip entry: " ~ name;
+ /+ check allowed characters +/
+ if (!(name.matchFirst(rgx_safe_entry_name)))
+ return "disallowed characters in zip entry: " ~ name;
+ return ""; /+ empty string means valid +/
+ }
+
+ /+ ↓ extract zip pod to temp directory, returns ZipPodResult +/
+ @trusted ZipPodResult extractZipPod(string zip_path) {
+ import std.zip;
+ ZipPodResult result;
+ result.ok = false;
+ /+ ↓ verify zip file exists +/
+ if (!exists(zip_path) || !zip_path.isFile) {
+ result.error_msg = "zip file not found: " ~ zip_path;
+ return result;
+ }
+ /+ ↓ derive pod name from zip filename +/
+ string zip_basename = zip_path.baseName.stripExtension;
+ /+ ↓ read and parse zip archive +/
+ ZipArchive zip;
+ try {
+ zip = new ZipArchive(read(zip_path));
+ } catch (ZipException ex) {
+ result.error_msg = "failed to read zip archive: " ~ zip_path ~ " - " ~ ex.msg;
+ return result;
+ } catch (Exception ex) {
+ result.error_msg = "error reading zip file: " ~ zip_path ~ " - " ~ ex.msg;
+ return result;
+ }
+ /+ ↓ validate entry count +/
+ if (zip.directory.length > MAX_ENTRY_COUNT) {
+ result.error_msg = "zip archive has too many entries ("
+ ~ zip.directory.length.to!string ~ " > " ~ MAX_ENTRY_COUNT.to!string ~ "): " ~ zip_path;
+ return result;
+ }
+ /+ ↓ validate all entries before extracting any +/
+ size_t total_size = 0;
+ foreach (name, member; zip.directory) {
+ /+ validate entry name +/
+ string name_err = validateEntryName(name);
+ if (name_err.length > 0) {
+ result.error_msg = name_err;
+ return result;
+ }
+ /+ check per-entry size +/
+ if (member.expandedSize > MAX_ENTRY_SIZE) {
+ result.error_msg = "zip entry too large ("
+ ~ member.expandedSize.to!string ~ " bytes): " ~ name;
+ return result;
+ }
+ /+ check total size +/
+ total_size += member.expandedSize;
+ if (total_size > MAX_TOTAL_SIZE) {
+ result.error_msg = "zip archive total size exceeds limit ("
+ ~ MAX_TOTAL_SIZE.to!string ~ " bytes): " ~ zip_path;
+ return result;
+ }
+ }
+ /+ ↓ create temp directory +/
+ string tmp_base = tempDir.buildPath("spine-zip-pod");
+ try {
+ if (!exists(tmp_base))
+ mkdirRecurse(tmp_base);
+ } catch (FileException ex) {
+ result.error_msg = "failed to create temp base directory: " ~ ex.msg;
+ return result;
+ }
+ /+ pod directory inside temp: tmp_base/pod_name/ +/
+ string pod_dir = tmp_base.buildPath(zip_basename);
+ try {
+ if (exists(pod_dir))
+ rmdirRecurse(pod_dir);
+ mkdirRecurse(pod_dir);
+ } catch (FileException ex) {
+ result.error_msg = "failed to create temp pod directory: " ~ ex.msg;
+ return result;
+ }
+ /+ ↓ extract entries +/
+ /+ zip internal structure uses paths like:
+ pod.manifest, conf/dr_document_make,
+ pod/media/text/en/filename.sst, image/filename.png
+ but the extracted pod directory needs to look like a normal pod:
+ pod.manifest, conf/dr_document_make,
+ media/text/en/filename.sst, image/filename.png
+ The "pod/" prefix in zip entries for text files maps to the pod root.
+ +/
+ /+ ↓ pre-compute canonical pod path for containment checks +/
+ auto canonical_pod = (pod_dir.asNormalizedPath).array.to!string ~ "/";
+ foreach (name, member; zip.directory) {
+ /+ skip directory entries +/
+ if (name.length > 0 && name[$-1] == '/')
+ continue;
+ /+ ↓ map zip internal path to filesystem path +/
+ /+ entries with "pod/" prefix: strip it so media/text/en/file.sst ends up at pod_dir/media/text/en/file.sst +/
+ string entry_path = name;
+ if (entry_path.length > 4 && entry_path[0..4] == "pod/") {
+ entry_path = entry_path[4..$];
+ }
+ string out_path = pod_dir.buildPath(entry_path);
+ /+ ↓ verify resolved path is within pod_dir (defense in depth) +/
+ auto canonical_out = (out_path.asNormalizedPath).array.to!string;
+ if (canonical_out.length < canonical_pod.length
+ || canonical_out[0..canonical_pod.length] != canonical_pod)
+ {
+ result.error_msg = "zip entry escapes extraction directory: " ~ name;
+ try { rmdirRecurse(pod_dir); } catch (FileException) {}
+ return result;
+ }
+ /+ ↓ create parent directories +/
+ string parent = out_path.dirName;
+ try {
+ if (!exists(parent))
+ mkdirRecurse(parent);
+ } catch (FileException ex) {
+ result.error_msg = "failed to create directory for: " ~ name ~ " - " ~ ex.msg;
+ try { rmdirRecurse(pod_dir); } catch (FileException) {}
+ return result;
+ }
+ /+ ↓ decompress and write file +/
+ try {
+ auto data = zip.expand(member);
+ std.file.write(out_path, data);
+ } catch (Exception ex) {
+ result.error_msg = "failed to extract: " ~ name ~ " - " ~ ex.msg;
+ try { rmdirRecurse(pod_dir); } catch (FileException) {}
+ return result;
+ }
+ }
+ /+ ↓ verify no symlinks were created (defense in depth) +/
+ string symlink_err = checkForSymlinks(pod_dir);
+ if (symlink_err.length > 0) {
+ result.error_msg = symlink_err;
+ try { rmdirRecurse(pod_dir); } catch (FileException) {}
+ return result;
+ }
+ /+ ↓ verify pod.manifest exists in extracted content +/
+ if (!exists(pod_dir.buildPath("pod.manifest"))) {
+ result.error_msg = "zip archive does not contain pod.manifest: " ~ zip_path;
+ try { rmdirRecurse(pod_dir); } catch (FileException) {}
+ return result;
+ }
+ result.tmp_dir = tmp_base;
+ result.pod_dir = pod_dir;
+ result.ok = true;
+ return result;
+ }
+
+ /+ ↓ recursively check for symlinks in extracted directory +/
+ @trusted string checkForSymlinks(string dir_path) {
+ try {
+ foreach (entry; dirEntries(dir_path, SpanMode.depth)) {
+ if (entry.isSymlink) {
+ return "symlink found in zip extraction: " ~ entry.name;
+ }
+ }
+ } catch (FileException ex) {
+ return "error checking for symlinks: " ~ ex.msg;
+ }
+ return "";
+ }
+
+ /+ ↓ clean up extracted temp directory +/
+ void cleanupZipPod(ref ZipPodResult zpr) {
+ if (zpr.pod_dir.length > 0 && exists(zpr.pod_dir)) {
+ try {
+ rmdirRecurse(zpr.pod_dir);
+ } catch (FileException ex) {
+ stderr.writeln("WARNING: failed to clean up temp zip extraction: ", zpr.pod_dir);
+ }
+ }
+ zpr.ok = false;
+ }
+}
diff --git a/src/sisudoc/spine.d b/src/sisudoc/spine.d
index 5d3b228..ee3bcef 100755
--- a/src/sisudoc/spine.d
+++ b/src/sisudoc/spine.d
@@ -77,6 +77,7 @@ import sisudoc.meta.rgx_files;
import sisudoc.io_in.paths_source;
import sisudoc.io_in.read_config_files;
import sisudoc.io_in.read_source_files;
+import sisudoc.io_in.read_zip_pod;
import sisudoc.io_out.hub;
mixin(import("version.txt"));
mixin(import("configuration.txt"));
@@ -856,6 +857,9 @@ string program_name = "spine";
auto _manifested = PathMatters!()(_opt_action, _env, "");
auto _manifests = [ _manifested ];
auto _conf_file_details = configFilePaths!()(_manifested, _env, _opt_action.config_path_set);
+ /+ ↓ track extracted zip pod temp directories for cleanup +/
+ mixin spineExtractZipPod;
+ ZipPodResult[] _zip_pod_extractions;
ConfComposite _siteConfig;
if (
_opt_action.require_processing_files
@@ -863,7 +867,16 @@ string program_name = "spine";
) {
foreach(arg; args[1..$]) {
if (!(arg.match(rgx.flag_action))) { /+ cli markup source path +/ // get first input markup source file names for processing
- _manifested = PathMatters!()(_opt_action, _env, arg);
+ string _config_arg = arg;
+ /+ ↓ if first non-flag arg is a zip, extract for config discovery +/
+ if (arg.match(rgx_files.src_pth_zip)) {
+ auto _zpr = extractZipPod(arg);
+ if (_zpr.ok) {
+ _zip_pod_extractions ~= _zpr;
+ _config_arg = _zpr.pod_dir;
+ }
+ }
+ _manifested = PathMatters!()(_opt_action, _env, _config_arg);
{ /+ local site config +/
_conf_file_details = configFilePaths!()(_manifested, _env, _opt_action.config_path_set);
auto _config_local_site_struct = readConfigSite!()(_conf_file_details, _opt_action, _cfg);
@@ -1047,7 +1060,166 @@ string program_name = "spine";
_manifests ~= _manifested;
}
} else if (arg.match(rgx_files.src_pth_zip)) {
- // fns_src ~= arg; // gather input markup source file names for processing
+ /+ ↓ zip pod archive: extract to temp dir, process as pod +/
+ /+ check if this zip was already extracted during config discovery +/
+ string _zip_pod_dir;
+ foreach (ref _zpr; _zip_pod_extractions) {
+ if (_zpr.ok && _zpr.pod_dir.length > 0
+ && _zpr.pod_dir.baseName == arg.baseName.stripExtension)
+ {
+ _zip_pod_dir = _zpr.pod_dir;
+ break;
+ }
+ }
+ if (_zip_pod_dir.length == 0) {
+ auto _zpr = extractZipPod(arg);
+ if (!_zpr.ok) {
+ writeln("ERROR >> Processing Skipped! Zip extraction failed: ", arg, " - ", _zpr.error_msg);
+ } else {
+ _zip_pod_extractions ~= _zpr;
+ _zip_pod_dir = _zpr.pod_dir;
+ }
+ }
+ if (_zip_pod_dir.length > 0) {
+ /+ process extracted pod directory same as regular pod +/
+ auto _zip_manifest = PodManifest!()(_opt_action, _zip_pod_dir);
+ if (_zip_manifest.pod_manifest_file_with_path
+ && _opt_action.abstraction
+ ) {
+ string pod_manifest_root_content_paths_to_markup_location_raw_;
+ string markup_contents_location_;
+ string sisudoc_txt_ = _zip_manifest.pod_manifest_file_with_path;
+ enforce(
+ exists(sisudoc_txt_)!=0,
+ "file not found: <<" ~
+ sisudoc_txt_ ~ ">>"
+ );
+ if (exists(sisudoc_txt_)) {
+ try {
+ import dyaml;
+ Node pod_manifest_yaml;
+ try {
+ pod_manifest_yaml = Loader.fromFile(sisudoc_txt_).load();
+ } catch (ErrnoException ex) {
+ } catch (FileException ex) {
+ writeln("ERROR failed to read config file");
+ } catch (Throwable) {
+ writeln("ERROR failed to read config file content, not parsed as yaml");
+ }
+ if ("doc" in pod_manifest_yaml) {
+ if (pod_manifest_yaml["doc"].type.mapping
+ && pod_manifest_yaml["doc"].tag.match(rgx_y.yaml_tag_is_map)
+ ) {
+ if ("path" in pod_manifest_yaml["doc"]) {
+ if (pod_manifest_yaml["doc"]["path"].tag.match(rgx_y.yaml_tag_is_seq)) {
+ foreach (string _path; pod_manifest_yaml["doc"]["path"]) {
+ markup_contents_location_ ~= _path ~ "\n";
+ pod_manifest_root_content_paths_to_markup_location_raw_ ~=
+ _path ~ "\n";
+ }
+ } else if (
+ pod_manifest_yaml["doc"]["path"].type.string
+ && pod_manifest_yaml["doc"]["path"].tag.match(rgx_y.yaml_tag_is_str)
+ ) {
+ markup_contents_location_ = pod_manifest_yaml["doc"]["path"].get!string;
+ pod_manifest_root_content_paths_to_markup_location_raw_ =
+ pod_manifest_yaml["doc"]["path"].get!string;
+ }
+ }
+ if ("filename" in pod_manifest_yaml["doc"]) {
+ if (pod_manifest_yaml["doc"]["filename"].tag.match(rgx_y.yaml_tag_is_seq)) {
+ foreach (string _filename; pod_manifest_yaml["doc"]["filename"]) {
+ if ("language" in pod_manifest_yaml["doc"]) {
+ if (pod_manifest_yaml["doc"]["language"].tag.match(rgx_y.yaml_tag_is_seq)) {
+ foreach (string _lang; pod_manifest_yaml["doc"]["language"]) {
+ markup_contents_location_ ~=
+ "media/text/"
+ ~ _lang ~ "/"
+ ~ _filename ~ "\n";
+ }
+ } else if (pod_manifest_yaml["doc"]["language"].tag.match(rgx_y.yaml_tag_is_str)
+ ) {
+ markup_contents_location_ =
+ "media/text/"
+ ~ pod_manifest_yaml["doc"]["language"].get!string
+ ~ "/" ~ _filename ~ "\n";
+ } else {
+ string _lang_default = "en";
+ markup_contents_location_ ~=
+ "media/text/"
+ ~ _lang_default ~ "/"
+ ~ pod_manifest_yaml["doc"]["filename"].get!string ~ "\n";
+ }
+ } else {
+ string _lang_default = "en";
+ markup_contents_location_ ~=
+ "media/text/"
+ ~ _lang_default ~ "/"
+ ~ pod_manifest_yaml["doc"]["filename"].get!string ~ "\n";
+ }
+ }
+ } else if (
+ pod_manifest_yaml["doc"]["filename"].type.string
+ && pod_manifest_yaml["doc"]["filename"].tag.match(rgx_y.yaml_tag_is_str)
+ ) {
+ if ("language" in pod_manifest_yaml["doc"]) {
+ if (pod_manifest_yaml["doc"]["language"].tag.match(rgx_y.yaml_tag_is_seq)) {
+ foreach (string _lang; pod_manifest_yaml["doc"]["language"]) {
+ markup_contents_location_ ~=
+ "media/text/"
+ ~ _lang ~ "/"
+ ~ pod_manifest_yaml["doc"]["filename"].get!string ~ "\n";
+ }
+ } else if (pod_manifest_yaml["doc"]["language"].tag.match(rgx_y.yaml_tag_is_str)) {
+ markup_contents_location_ =
+ "media/text/"
+ ~ pod_manifest_yaml["doc"]["language"].get!string
+ ~ "/" ~ pod_manifest_yaml["doc"]["filename"].get!string ~ "\n";
+ } else {
+ string _lang_default = "en";
+ markup_contents_location_ ~=
+ "media/text/"
+ ~ _lang_default ~ "/"
+ ~ pod_manifest_yaml["doc"]["filename"].get!string ~ "\n";
+ }
+ } else {
+ string _lang_default = "en";
+ markup_contents_location_ ~=
+ "media/text/"
+ ~ _lang_default ~ "/"
+ ~ pod_manifest_yaml["doc"]["filename"].get!string ~ "\n";
+ }
+ }
+ }
+ }
+ }
+ } catch (ErrnoException ex) {
+ } catch (FileException ex) {
+            // NOTE(review): FileException is silently swallowed here; manifest parsing
+ }
+ } else {
+ writeln("manifest not found: ", sisudoc_txt_);
+ }
+ auto markup_contents_locations_arr
+ = (cast(char[]) markup_contents_location_).split;
+ auto tmp_dir_ = (sisudoc_txt_).dirName.array;
+ foreach (markup_contents_location; markup_contents_locations_arr) {
+ assert(markup_contents_location.match(rgx_files.src_pth_sst_or_ssm),
+ "not a recognised file: <<" ~
+ markup_contents_location ~ ">>"
+ );
+ auto markup_contents_location_pth_ = (markup_contents_location).to!string;
+ Regex!(char) lang_rgx_ = regex(r"/(" ~ _opt_action.languages_set.join("|") ~ ")/");
+ if (_opt_action.languages_set[0] == "all"
+ || (markup_contents_location_pth_).match(lang_rgx_)
+ ) {
+ auto _fns = (((tmp_dir_).chainPath(markup_contents_location_pth_)).array).to!string;
+ _manifested = PathMatters!()(_opt_action, _env, _zip_pod_dir, _fns, markup_contents_locations_arr);
+ _manifests ~= _manifested;
+ }
+ }
+ }
+ }
} else { // anything remaining, unused
arg_unrecognized ~= " " ~ arg;
}
@@ -1277,4 +1449,8 @@ string program_name = "spine";
}
}
} // else { writeln("NO METADATA CURATED"); }
+ /+ ↓ clean up any extracted zip pod temp directories +/
+ foreach (ref _zpr; _zip_pod_extractions) {
+ cleanupZipPod(_zpr);
+ }
}