Monochromization of PDF Files

Nov 7 2022 · LastMod: Nov 7 2022

Written for POSIX shells. Depends on:

  1. qpdf.
  2. parallel (GNU parallel). xargs can be used instead if desired, but the script would then be painfully slow.
  3. imagemagick for the convert command.
  4. Optional: ocrmypdf, needed only if OCR is turned on (-r 1); zathura, used to preview the result in threshold-test mode (-e 1).

The output file will be generated in a directory under /tmp/ with a name of the form tmp.XXXXXXXXX.monochromize. The directory will not be deleted automatically.

The threshold value is a percentage (out of 100). For example, -e 1 -t 90 previews a test page in which every pixel whose brightness is below 90% is turned black.

 1#!/bin/sh
 2
 3GREEN='\033[0;32m'
 4CYAN='\033[0;36m'
 5NC='\033[0m'
 6
 7test_page=23
 8density=300
 9threshold=50
10ocr=0
11lang=eng
12output=output.pdf
13
14test_threshold_arg=0
15
# Print one option line of the help text: tab, coloured flag, tab, description.
# Globals:   GREEN, NC (read) - colour on/off sequences; may be unset
# Arguments: $1 - option flag (e.g. -i)
#            $2 - description
_help_opt() {
  # %b expands the backslash escapes stored in the colour variables; the flag
  # and the description go through %s so nothing in them is interpreted.
  printf '\t%b%s%b\t%s\n' "${GREEN-}" "$1" "${NC-}" "$2"
}

# Print the usage summary to stdout and terminate the script with status 0.
# Globals:   GREEN, CYAN, NC (read) - colour sequences; may be unset
# Outputs:   help text on stdout
# Returns:   never returns; exits 0
show_help_msg() {
  # printf is used instead of an unquoted `echo $(cat <<EOF ...)`: the old form
  # was subject to pathname expansion (so "[OPTIONS]" could be replaced by a
  # file called O, P, T, ... in the current directory) and relied on echo
  # interpreting \n, \t and \033, which not every shell's echo does.
  printf 'Usage: %b%s%b [OPTIONS] %b-i%b INPUT %b-o%b OUTPUT\n\n' \
    "${CYAN-}" "${0##*/}" "${NC-}" \
    "${GREEN-}" "${NC-}" "${GREEN-}" "${NC-}"
  _help_opt -i 'Input file name'
  _help_opt -o 'Output file name'
  printf '\n'
  _help_opt -l 'Set ocr language'
  _help_opt -t 'Set threshold'
  _help_opt -r 'Set OCR (0 off, 1 on, default off)'
  printf '\n'
  _help_opt -e 'Test threshold (0 off, 1 on, default 0)'
  _help_opt -p 'Set the page used by the threshold test'
  _help_opt -h 'Show this help message'
  exit 0
}
31
# Preview the effect of the current threshold on a single page, then exit.
# Extracts page $test_page of $input, converts it to a bilevel fax-compressed
# PDF and opens the result in zathura.
# Globals:   input, test_page, density, threshold (read)
#            test_f, output (written; both hold temporary file paths)
# Returns:   never returns; exits 0 after the viewer command finishes,
#            1 if a temporary file cannot be created or a tool fails
test_threshold() {
  if [ ! -e "$input" ]; then show_help_msg; fi

  test_f=$(mktemp --suffix .test_threshold.input.pdf) || exit 1
  output=$(mktemp --suffix .test_threshold.output.pdf) || {
    rm -f -- "$test_f"
    exit 1
  }

  # Register cleanup before any tool runs so the temporary files are removed
  # on every exit path. Single quotes defer expansion until the trap fires.
  trap 'rm -f -- "$test_f" "$output"' EXIT
  # Turn common termination signals into a normal exit so the EXIT trap runs.
  trap 'exit 1' HUP INT TERM

  qpdf "$input" --pages . "$test_page" -- "$test_f"
  # qpdf documents exit status 3 as "succeeded with warnings".
  case $? in
    0 | 3) ;;
    *) exit 1 ;;
  esac

  convert -density "$density" "$test_f" -threshold "${threshold}%" \
    -type bilevel -compress fax "$output" || exit 1

  zathura "$output"

  exit 0
}
47
# Convert every page of $input to a bilevel, fax-compressed PDF and merge the
# pages into one file inside a fresh temporary working directory. The working
# directory is left in place afterwards.
# Globals:   input, output, density, threshold, ocr, lang (read)
#            working_dir (written)
# Outputs:   the working directory and the output path on stdout
# Returns:   0 on success (or the ocrmypdf status when OCR is enabled),
#            1 if a directory cannot be created or a tool fails
start_monochromize() {
  local page_count out_name result

  working_dir=$(mktemp -d --suffix .monochromize) || return 1
  printf 'Working Directory %s\n' "$working_dir"
  mkdir "$working_dir/split" "$working_dir/comp" || return 1

  qpdf --split-pages "$input" "$working_dir/split/out.pdf"
  # qpdf documents exit status 3 as "succeeded with warnings".
  case $? in
    0 | 3) ;;
    *) return 1 ;;
  esac

  # Count the split pages with a glob instead of parsing ls output. The
  # positional parameters set here belong to this function call only.
  set -- "$working_dir"/split/out-*.pdf
  [ -e "$1" ] || return 1
  page_count=$#

  # seq -w yields zero-padded page numbers, which is how the split files are
  # expected to be numbered.
  seq -w 1 "$page_count" \
    | parallel convert -density "$density" "$working_dir/split/out-{}.pdf" \
      -threshold "${threshold}%" -type bilevel -compress fax \
      "$working_dir/comp/comp-{}.pdf" || return 1

  # Only the file-name part of -o is used: the result always lands in the
  # working directory, and a path with directories would otherwise point at a
  # sub-directory that does not exist there.
  out_name=${output##*/}
  [ -n "$out_name" ] || out_name=output.pdf
  result=$working_dir/$out_name

  qpdf --empty --pages "$working_dir"/comp/comp* -- "$result"
  case $? in
    0 | 3) ;;
    *) return 1 ;;
  esac
  printf 'Output %s\n' "$result"

  if [ "$ocr" -eq 1 ]; then
    ocrmypdf -l "$lang" --optimize 3 --skip-text "$result" "${result}_ocr.pdf"
  fi
}
61
62
# Parse the command line. The leading ':' in the option string selects silent
# error reporting, so missing arguments and unknown options arrive as the
# ':' and '?' cases below instead of being printed by getopts itself.
# -h takes no argument.
while getopts ':t:i:o:r:l:p:e:h' opt; do
  case "$opt" in
    t) threshold=$OPTARG ;;
    i) input=$OPTARG ;;
    o) output=$OPTARG ;;
    r) ocr=$OPTARG ;;
    l) lang=$OPTARG ;;
    p) test_page=$OPTARG ;;
    e) test_threshold_arg=$OPTARG ;;
    h) show_help_msg ;;
    :)
      printf '%s: option -%s requires an argument\n' "${0##*/}" "$OPTARG" >&2
      exit 2
      ;;
    \?)
      printf '%s: unknown option -%s\n' "${0##*/}" "$OPTARG" >&2
      exit 2
      ;;
  esac
done

# Without arguments, or without an existing input file, print the help text
# (show_help_msg exits the script).
if [ "$#" -eq 0 ]; then show_help_msg; fi
if [ ! -e "$input" ]; then show_help_msg; fi
# Threshold-test mode previews one page and exits without converting the file.
if [ "$test_threshold_arg" -eq 1 ]; then test_threshold; fi

start_monochromize