scripts

author: Mike Vink <ivi@vinkies.net> 2025-10-23 20:46:21 +0200
committer: Mike Vink <ivi@vinkies.net> 2025-10-23 20:46:21 +0200
commit: 4acdda4eff137071be4d7cb6293ecaf0cb2dd8d3 (patch)
tree: 1580b5db2a4ecc05b8825dadf0b18eb08d8c2dc4 /.local
parent: 29a759d2c7614cd4bea6f7f9b656120f87f60d33 (diff)
1 files changed, 264 insertions, 0 deletions
diff --git a/.local/bin/memo b/.local/bin/memo
new file mode 100755
index 0000000..2a05c80
--- /dev/null
+++ b/.local/bin/memo
@@ -0,0 +1,264 @@
+#!/usr/bin/env bash
+#
+# memo(1), memoizes the output of your command-line, so you can do:
+#
+#  $ memo <some long running command> | ...
+#
+# Instead of
+#
+#  $ <some long running command> > tmpfile
+#  $ cat tmpfile | ...
+#  $ rm tmpfile
+#
+# You can even use it in the middle of a pipe if you know that the input is not
+# extremely long. Just supply the -s switch:
+#
+#  $ cat sitelist | memo -s parallel curl | grep "server:"
+#
+# As long as "sitelist" isn't changed, the curl(1) invocation will not be rerun.
+#
+# Memo provides some nice to haves, like transparant (de)compressing of the
+# output so you're less likely to be disk I/O bound and save some space. The
+# order of preferred compression algorithms is: lz4, xz, gzip. If none of these
+# programs are present on the system, the output is stored uncompressed.
+#
+# If you think this is handy, you're right. Yet, there are good reasons why this
+# sort of tool isn't distributed by default on distributions: it can give
+# surprising effects. Suppose the <command> you're trying to memoize depends on
+# the current working directory. Changing the directory and running the same
+# memo invocation will now paste the wrong data to stdout. Use this only if you
+# know what it's doing.
+#
+# That said, I use it a lot, I really can't be bothered to create arbitrarily
+# named temporary files all the time when I'm grabbing output from some slow
+# networked program. Additionally, memo transparantly compresses.
+#
+# If you want to memo'ize shell functions, then you can source this script
+# instead of executing it. This will define a function called memo() that you
+# can use in exactly the same way. It made the implementation much uglier, but
+# at least I can memo'ize functions.
+#
+# Don't let the /bin/bash shebang at the top fool you, it runs just fine in zsh.
+
+# Detect being sourced from zsh and bash.
+__memo_sourced=0
+([[ -n $ZSH_EVAL_CONTEXT && $ZSH_EVAL_CONTEXT =~ :file$ ]] ||
+ [[ -n $KSH_VERSION && $(cd "$(dirname -- "$0")" &&
+    printf '%s' "${PWD%/}/")$(basename -- "$0") != "${.sh.file}" ]] ||
+ [[ -n $BASH_VERSION && $0 != "$BASH_SOURCE" ]]) && __memo_sourced=1
+
+memo() {
+  (
+  set -euo pipefail
+
+  usage() {
+      echo "SYNOPSIS"
+      echo "  memo [-chs] <command>"
+      echo ""
+      echo "DESCRIPTION"
+      echo "  Memoizes the output of <command> and outputs it."
+      echo ""
+      echo "  NOTE: If an interrupt happens while memoizing, it is cleared."
+      echo ""
+      echo "OPTIONS"
+      echo "  -c  Clean the cache, if no command is specified, clean everything."
+      echo "  -h  Usage message."
+      echo "  -s  Take stdin as an input to the command. Note, that this"
+      echo "      requires buffering up all the input before memo is able to"
+      echo "      decide whether to replay the cached version or not."
+      return $1
+  }
+
+  # Process options.
+  local opt_clear=0
+  local opt_stdin=0
+  while getopts :chs opt ; do
+    case $opt in
+      c) opt_clear=1 ;;
+      h) usage 0 ;;
+      s) opt_stdin=1 ;;
+      ?) (( --OPTIND )) ; break ;;
+    esac
+  done
+  shift $(( OPTIND - 1 ))
+
+  (( $# > 0 )) || usage 1
+
+  # Ensures that the memodir for $USER exists. If it doesn't exist, tries to
+  # create it with the right permissions.
+  ensuredir() {
+    local dir="${TMPDIR:-/tmp}/memo"
+    [[ -d "$dir" ]] || mkdir -p -m 0777 "$dir"
+    dir="$dir/$USER"
+    [[ -d "$dir" ]] || mkdir -m 0700 "$dir"
+    echo -n "$dir"
+  }
+
+  # Generate a sha512, done this way to account for differences between most
+  # Linux distros and OSX.
+  genhash() {
+    hash sha512sum 2>/dev/null && sha512sum || shasum -a 512
+  }
+
+  # Generates a hashed path inside of the memodir. The hash is based on the
+  # stdin of this function.
+  path() {
+    # Output the first part
+    ensuredir
+    # Output a separator
+    echo -n "/"
+    # Finally, output the hashed arguments and use it as the filename The shasum
+    # family of utilities tend to output the hashes in hex format, so no need to
+    # fear strange characters being output.
+    genhash | cut -d' ' -f1
+  }
+
+  # Echo memo's preferred compressor, must take input on stdin and output
+  # a compressed stream on stdout. Replaces current shell.
+  compressor() {
+    if hash zstd 2>/dev/null ; then exec zstd
+    elif hash lz4 2>/dev/null ; then exec lz4
+    elif hash xz 2>/dev/null ; then exec xz
+    elif hash gzip 2>/dev/null ; then exec gzip
+    fi
+    exec cat -
+  }
+
+  # Same as compressor, but in reverse. Keep these two in sync. Replaces current
+  # shell.
+  decompressor() {
+    if hash zstd 2>/dev/null ; then exec zstd -dc
+    elif hash lz4 2>/dev/null ; then exec lz4 -dc
+    elif hash xz 2>/dev/null ; then exec xz -dc
+    elif hash gzip 2>/dev/null ; then exec gzip -dc
+    fi
+    exec cat -
+  }
+
+  # Cats a file, decompressing if necessary. The filename is the first argument,
+  # and it will get a compression extension (.gz, ...) applied for every
+  # available decompressor to see if the file exists.
+  catfile() {
+    if [ -f "$1.zst" ] && hash zstd 2>/dev/null ; then
+      zstd -dc < "$1.zst" || true
+    elif [ -f "$1.lz4" ] && hash lz4 2>/dev/null ; then
+      lz4 -dc < "$1.lz4" || true
+    elif [ -f "$1.xz" ] && hash xz 2>/dev/null ; then
+      xz -dc < "$1.xz" || true
+    elif [ -f "$1.gz" ] && hash gzip 2>/dev/null ; then
+      gzip -dc < "$1.gz" || true
+    elif [ -f "$1" ] ; then
+      cat "$1" || true
+    else
+      # Cache not found, sad face.
+      return 1
+    fi
+  }
+
+  # Compress standard input to $1.<ext> (.ext is based on the best compression
+  # program found). Calling this function will replace the current shell with
+  # the decompressor, so make sure this is a terminal statement.
+  compressfile() {
+    if hash zstd 2>/dev/null ; then
+      exec zstd > "$1.zst"
+    elif hash lz4 2>/dev/null ; then
+      exec lz4 > "$1.lz4"
+    elif hash xz 2>/dev/null ; then
+      exec xz > "$1.xz"
+    elif hash gzip 2>/dev/null ; then
+      exec gzip > "$1.gz"
+    else
+      exec cat - > "$1"
+    fi
+  }
+
+  # Blast the entire folder if -c was passed and nothing more.
+  if (( opt_clear )) && (( $# == 0 )) ; then
+    rm -rf "$(ensuredir)"
+  else
+    # If stdin needs to be part of the unique command fingerprint, we need to
+    # save it somewhere so we can first hash it and then check if we already
+    # have a cache. Let's create a temporary file.
+    local stdinsave
+    if (( opt_stdin )) ; then
+      # Create a temporary file and schedule its cleanup.
+      stdinsave=$(mktemp)
+      cleanup() {
+        rm "$stdinsave"
+        trap - EXIT # Clean EXIT trap.
+      }
+
+      # Cleanup file after either EXIT (if run as a standalone script) or
+      # RETURN (if run as a sourced bash function). zsh seems to also fire
+      # the EXIT trap when a sourced function returns.
+      #
+      # Check if $ZSH_VERSION is set in an antique way because macOS is
+      # stuck on pre 4.2 bash, so it doesn't know about -v (and we're in set
+      # -u mode, so most other checks will fail).
+      if [[ -z "${ZSH_VERSION-}" ]] ; then
+        trap cleanup EXIT RETURN
+      else
+        trap cleanup EXIT
+      fi
+    fi
+
+    # In the following block we construct the unique path to save the output of
+    # the command in.
+    local file
+    file=$({
+      # The command and its arguments are always part of the hash.
+      echo -n "$@"
+
+      # Save the input while compressing it and pass it through with tee(1).
+      if (( opt_stdin )) ; then
+        tee >(compressor > "$stdinsave")
+      fi
+    } | path)
+
+    if (( opt_clear )) ; then
+      # Remove the saved content(s) of this hash if -c was passed.
+      find "$(dirname "$file")" -type f -name "$(basename "$file")*" -delete
+    else
+      # Output the cached file or generate a new one.
+      catfile "$file" || {
+        # Clear the cache and forward the return/signal code.
+        nocache() {
+          # $? represents the signal on all handlers except for ERR, where it
+          # represents the process error code. Note to self: find a clever way
+          # to deal with this.
+          declare -ri ret=$?
+          trap - INT # Clear traps.
+          rm "$file"* 2>/dev/null || true
+          exit $ret
+        }
+
+        # WARNING: if you're ever thinking of adding more signals to clear the
+        # cache on, read the following:
+        #
+        # When adding the 'ERR' signal to the set, any 'INT' signal received
+        # will trigger the specified function twice (from the same process, see
+        # the commit comments). I cannot explain why this is the case, but it's
+        # so. To try it out: trap nocache INT ERR, then make the process
+        # interrupt by pressing ctrl-c.
+        trap nocache INT
+
+        if (( opt_stdin )) ; then
+          # It appears we don't have a cached file, but we already absorbed
+          # stdin, so we need to produce it again.
+          exec < <(decompressor < "$stdinsave")
+        fi
+        "$@" | tee >(compressfile "$file")
+      }
+    fi
+  fi
+  )
+}
+
+(( __memo_sourced )) && {
+  unset __memo_sourced
+  return 0
+}
+
+# If we reach here, we're not being sourced.
+set -euo pipefail
+memo "$@"
author	Mike Vink <ivi@vinkies.net>	2025-10-23 20:46:21 +0200
committer	Mike Vink <ivi@vinkies.net>	2025-10-23 20:46:21 +0200
commit	4acdda4eff137071be4d7cb6293ecaf0cb2dd8d3 (patch)
tree	1580b5db2a4ecc05b8825dadf0b18eb08d8c2dc4 /.local
parent	29a759d2c7614cd4bea6f7f9b656120f87f60d33 (diff)