#! /usr/bin/perl -w

# This program is free software. It comes without any warranty, to the
# extent permitted by applicable law. You can redistribute it and/or
# modify it under the terms of the Do What The Fuck You Want To Public
# License, Version 2, as published by Sam Hocevar.
# See http://sam.zoy.org/wtfpl/COPYING for more details.

$VERSION = "0.4, 12 May 2011";

use Getopt::Long qw(:config posix_default no_ignore_case gnu_compat
                    bundling auto_version auto_help);
use Pod::Usage;
use IPC::Open3;
use POSIX qw(floor);
use Error qw(:try);
use strict;

=head1 NAME

pdftool.pl - a PDF swiss army knife

=head1 SYNOPSIS

B<pdftool.pl> [B<-w> I<width>] [B<-h> I<height>] [B<-p> I<paper>]
[B<-W> I<width>] [B<-H> I<height>] [B<-P> I<paper>] [B<-s> I<pages>]
[B<-m> I<margin>] [B<-b> I<border>] [B<-c>] [B<--book>] [B<--column>]
[B<-n> I<nup>] [B<--screen>] [B<-q>] [I<infile> [I<outfile>]]

=head1 DESCRIPTION

B<pdftool.pl> combines the tools in the PSUtils bundle in a nice way.
The input should be either a Portable Document Format (PDF) file, or a
PostScript file. The output format is PDF only.

If no input file is given, or if a single hyphen-minus (I<->) is given
as file name, B<pdftool.pl> will read the PDF or PostScript data from
the standard input. In that case, and if the input data is in PDF
format, an auxiliary file will be created (since the conversion from PDF
to PS requires random access to the data), and removed afterwards. Also,
if the crop option (B<-c>) is set, an auxiliary file will be created,
and removed afterwards.

If no output file is given, or if a single hyphen-minus (I<->) is given
as file name, B<pdftool.pl> will send the data to the standard output.

By default, B<pdftool.pl> rotates the pages in order to ensure that
your pdf will be printable using your favorite duplex mode for portrait
documents (Tumble if you prefer to turn the pages like those of a book).
See the B<--screen> option to bypass this behavior.

The document will be treated as follows:

=over 4

=item *

Convert from PDF to PostScript (if necessary, and if possible - that is
if all page numbers are relative to the beginning of the document -,
convert only the smallest interval that contains all the selected
pages),

=item *

Select the page range,

=item *

Calculate the minimal bounding box,

=item *

Rearrange pages for printing books or booklets,

=item *

Put multiple pages per sheet,

=item *

Convert back from PS to PDF.

=back

=head1 OPTIONS

=over 8

=item B<-s> I<pages>, B<--select=>I<pages>

Specify the pages which are to be selected. I<pages> is a comma
separated list of page ranges, each of which may be a page number, or a
page range of the form I<first>-I<last>. If I<first> is omitted, the
first page is assumed, and if I<last> is omitted, the last page is
assumed. The prefix character `_' indicates that the page number is
relative to the end of the document, counting backwards. If just this
character with no page number is used, a blank page will be inserted.

=item B<-w> I<width>, B<--width=>I<width>

Specify the width of the output file. If the height is not specified as
well, it will be ignored. The known units are I<pt>, I<in>, I<cm> and
I<mm>. The default unit is I<pt>.

=item B<-h> I<height>, B<--height=>I<height>

Specify the height of the output file. If the width is not specified as
well, it will be ignored. The known units are I<pt>, I<in>, I<cm> and
I<mm>. The default unit is I<pt>.

=item B<-p> I<paper>, B<--paper=>I<paper>

Specify the paper size of the output file, as an alternative to B<-w>
and B<-h>. Can be set to I<a0>, I<a1>, I<a2>, I<a3>, I<a4>, I<a5>,
I<b5>, I<letter>, I<legal>, I<ledger>, I<tabloid>, I<statement>,
I<executive>, I<folio>, I<quarto>, or I<10x14>. The default output paper
size is I<a4>.

=item B<-W> I<width>, B<--Width=>I<width>

Same as the option B<-w>, but for the input file. This option is ignored
if the crop option (B<-c>) is set.

=item B<-H> I<height>, B<--Height=>I<height>

Same as the option B<-h>, but for the input file. This option is ignored
if the crop option (B<-c>) is set.

=item B<-P> I<paper>, B<--Paper=>I<paper>

Same as the option B<-p>, but for the input file.
By default, B<pdftool.pl> will try to guess this value from the header
of the file, and will fail if the information is missing. This option is
ignored if the crop option (B<-c>) is set.

=item B<-b> I<border>, B<--border=>I<border>

Add a margin around each logical page on a sheet. Possible units are
I<pt>, I<in>, I<cm> and I<mm>. The default unit is I<pt>. The default
border is I<1cm> if the crop option (B<-c>) is set, and I<0> otherwise.

=item B<-m> I<margin>, B<--margin=>I<margin>

Add a margin around the whole page. Possible units are I<pt>, I<in>,
I<cm> and I<mm>. The default unit is I<pt>. The default margin is I<0>.

=item B<-c>, B<--crop>

If this option is set, the PostScript code will be interpreted to
calculate the maximal effective bounding box. This operation may take
time and be quite demanding for the CPU. See the note for the border
option (B<-b>) above.

=item B<--book>

Rearrange pages for printing books or booklets. If your "default" duplex
mode (see B<lpoptions>) is "NoTumble", you should either use lpr with
the option "Tumble" manually, or consider the B<pdftool.pl> option
B<--screen> instead.

=item B<-n> I<nup>, B<--nup=>I<nup>

Put multiple logical pages onto each physical sheet of paper. If I<nup>
is less than 10, the option B<->I<nup> may be used as an alternative.

=item B<--screen>

By default, B<pdftool.pl> ensures that your pdf will be printable using
your "default" duplex mode (see B<lpoptions>). B<--screen> tries to make
the output PDF ready to read on your computer instead. It has no effect
for portrait documents.

=item B<--column>

Change the order to "column-major", where successive pages are placed in
columns down the paper.

=item B<-q>, B<--quiet>

B<pdftool.pl> normally prints the page numbers of the pages output; this
option suppresses this.

=item B<--help>

Display a brief help.

=item B<--version>

Display the version number of the B<pdftool.pl> program.

=item B<--man>

Display the manual page.

=back

=head1 EXAMPLES

The following command can be used to remotely crop a PDF file, convert it
to A4 paper, and rearrange the pages to print a booklet with custom
margins:

C<< ssh remote pdftool.pl -cpA4 --book -2 -b2cm -m-1cm < in.pdf | \
lpr -o Duplex=DuplexTumble >>

=head1 REQUIREMENTS

Requires PSUtils installed and available via the command line
http://www.tardis.ed.ac.uk/~ajcd/psutils/.

=head1 AUTHOR

Copyright 2010-2011 Guilhem Moulin. See the source for copying conditions.

=cut

# Directory in which auxiliary files are created
my $tmpdir = '/tmp';


#
# Options & arguments
#

my $select;
my ($outwidth, $outheight, $inwidth, $inheight);
my ($margin, $border);
my $crop;
my $book;
my $nup = 1;
my $column;
my $quiet;
my $man;
my $screen;

GetOptions(
    "select|s=s" => \$select,
    "w|width=s"  => \$outwidth,
    "h|height=s" => \$outheight,
    "p|paper=s"  => sub { papersize($_[1], \$outwidth, \$outheight) },
    "W|Width=s"  => \$inwidth,
    "H|Height=s" => \$inheight,
    "P|Paper=s"  => sub { papersize($_[1], \$inwidth, \$inheight) },
    "margin|m=s" => \$margin,
    "border|b=s" => \$border,
    "crop|c"     => \$crop,
    "book"       => \$book,
    "nup|n=i"    => \$nup,
    "screen"     => \$screen,
    # -1 .. -9 are shortcuts for --nup=1 .. --nup=9
    "1"          => sub { $nup = 1 },
    "2"          => sub { $nup = 2 },
    "3"          => sub { $nup = 3 },
    "4"          => sub { $nup = 4 },
    "5"          => sub { $nup = 5 },
    "6"          => sub { $nup = 6 },
    "7"          => sub { $nup = 7 },
    "8"          => sub { $nup = 8 },
    "9"          => sub { $nup = 9 },
    "column"     => \$column,
    "q|quiet"    => \$quiet,
    "man"        => \$man
) or pod2usage(2);

# At most two positional arguments: input file and output file
pod2usage(2) if $#ARGV > 1;
pod2usage(-exitstatus => 0, -verbose => 2) if defined $man;

# Input and output files
my ($infile, $outfile) = @ARGV;


#
# Default values
#

# Default margin & border
$margin = 0 unless defined $margin;
unless (defined $border) {
    $border = defined $crop ? '1cm' : 0;
}

# Default output papersize
papersize("a4", \$outwidth, \$outheight)
    unless defined $outwidth and defined $outheight;

# Default unit: PostScript point
for my $length (\$outwidth, \$outheight, \$inwidth, \$inheight,
                \$margin, \$border) {
    topoints($length);
}

die "Margins are too big"
    if $outwidth <= $margin*2 or $outheight <= $margin*2;


#
# Check options
#

die "Bad page range: `$select'"
    if defined $select
       && $select !~ /^(_?\d*-?_?\d*,)*_?\d*-?_?\d*$/;
die "Bad nup: `$nup'"
    if defined $nup && !($nup =~ /^\d+$/ && $nup > 0);


#
# Open input and output files
#

my ($FIN, $FOUT);
if (defined $infile && $infile ne "-") {
    open $FIN, '<', $infile or die "Can't read `$infile': $!";
}
else {
    # An undefined $infile tells pdftops() that the data comes from STDIN
    undef $infile;
    $FIN = *STDIN;
}
if (defined $outfile && $outfile ne "-") {
    open $FOUT, '>', "$outfile" or die "Can't create `$outfile': $!";
}
else {
    $FOUT = *STDOUT;
}
*LOG = *STDERR;

# Auxiliary files, to remove
my @auxfiles;

# Pids, to wait for; each element is [ $pid, @command ]
my @pids;

# Return value
my $return = 0;

try {
    my $FD = pdftops($FIN);
    $FD = psselect($FD) if defined $select;
    my ($FD2, @bbox) = psbbox($FD);
    $FD2 = psbook($FD2) if defined $book;
    my ($FD3, $landscape, $rotate) = psnup($FD2, @bbox);
    pstopdf($FD3, $FOUT, $landscape, $rotate);
}
catch Error with {
    # Print the error message
    my $error = shift;
    print LOG $error;

    # Kill all the running children
    kill 15, map { $_->[0] } @pids;
    $return = 1;
}
finally {
    # Avoid zombies
    for my $child (@pids) {
        my ($pid, @cmd) = @$child;
        my ($r, $v) = (waitpid ($pid, 0), $?);
        warn "Can't run `" . printcmd(@cmd) . "'"
            if ($r != -1 and $v >> 8);
    }

    # Close opened file handles
    for my $fh ($FIN, $FOUT) {
        close $fh or die "Can't close: $!";
    }

    # Delete auxiliary files
    unlink @auxfiles;

    exit $return;
};

# Useless, but Perl doesn't see that this filehandle is used more than
# one time
close IN; # automatically closed by `open3'
close OUT;


# =========================================================

#
# Conversion from PDF to PS, if necessary
#
# Takes the input filehandle and returns a filehandle from which
# PostScript data can be read.  PostScript input is passed through a
# pipe unchanged; PDF input is converted by the external `pdftops'
# command.  May rewrite the global $select (page numbers are shifted when
# only a sub-range of the PDF is converted) and $infile (set to the
# auxiliary file when the PDF data came from the standard input).
#
sub pdftops {
    my $IN = $_[0];
    my $OUT;

    #
    # Detect filetype
    #
    # To avoid seeking into IN, its content is copied from WRITE to READ
    # in the background, once the filetype has been read
    #
    # TODO: read specifications, to properly detect the filetype

    my $filetype;
    my ($READ, $WRITE);
    pipe $READ, $WRITE or die "Can't pipe: $!";
    while (!defined $filetype && defined (my $line = <$IN>)) {
        # Every line consumed here is forwarded, so READ sees the whole input
        print {$WRITE} $line or die "Can't print: $!";
        if ($line =~ /^%!PS/) {
            $filetype = "PS";
        }
        elsif ($line =~ /^%PDF/) {
            $filetype = "PDF";
        }
    }
    die "Can't recognize the filetype" unless defined $filetype;

    unless (my $pid = fork) {
        # Child: cat $IN > $WRITE in background
        die "Can't fork: $!" unless defined $pid;
        close $READ or die "Can't close: $!";
        while (<$IN>) {
            print {$WRITE} $_ or die "Can't print: $!";
        }
        exit;
    }

    # Parent
    close $WRITE or die "Can't close: $!";
    return $READ if $filetype eq "PS";

    #
    # Conversion from PDF to PS
    #

    unless (defined $infile) {
        # Need to copy the whole input to an auxiliary file, since
        # conversion from PDF to PS requires random access to the data
        $infile = "$tmpdir/pdftool-stdin-$$." . lc $filetype;
        open my $AUX, '>', $infile
            or die "Can't write into `$infile': $!";
        push @auxfiles, $infile;

        # cat > $infile
        while (<$READ>) {
            print {$AUX} $_ or die "Can't print: $!";
        }
        close $AUX;
    }

    my ($first, $last);
    # pdftops doesn't provide any way to have page numbers relative to
    # the end of the document, hence there is no detection of the
    # smallest interval if $select contains `_'
    if (defined $select && $select !~ /_/) {
        # Convert to PS only the pages we are interested in.
        # $lowest/$highest are the extreme explicit page numbers; the
        # $open_* flags record a range with an omitted bound.  A bound is
        # handed to pdftops only when at least one explicit number was
        # seen and no range is open on that side.
        my ($lowest, $highest, $open_start, $open_end);
        for my $range (split /,/, $select) {
            my ($rmin, $sep, $rmax) = $range =~ /^(\d*)(-?)(\d*)$/;
            $open_start = 1 if $sep && !$rmin;
            $open_end   = 1 if $sep && !$rmax;
            for my $page (grep { $_ } $rmin, $rmax) {
                $lowest  = $page if !defined $lowest  || $page < $lowest;
                $highest = $page if !defined $highest || $page > $highest;
            }
        }
        $first = $lowest  unless $open_start;
        $last  = $highest unless $open_end;

        # Calculate the new page range: once pdftops starts at page
        # $first, that page becomes page 1 of the PostScript document
        my @newselect;
        for my $range (split /,/, $select) {
            my ($rmin, $sep, $rmax) = $range =~ /^(\d*)(-?)(\d*)$/;
            if (defined $first) {
                $rmin -= $first-1 if $rmin;
                $rmax -= $first-1 if $rmax;
            }
            push @newselect, "$rmin$sep$rmax";
        }
        $select = join ',', @newselect;
    }

    # Convert to PS
    my @cmd = ('pdftops', '-origpagesizes', $infile, '-');
    push @cmd, '-f', $first if defined $first;
    push @cmd, '-l', $last  if defined $last;
    push @cmd, '-q' if defined $quiet;

    my $pid = open ($OUT, "-|", @cmd)
        or die "Can't run `" . printcmd(@cmd) . "'";
    push @pids, [$pid, @cmd];

    return $OUT;
}


#
# Select some pages in the document
#
# Feeds the given filehandle to `psselect -p $select' and returns the
# filehandle connected to its standard output.
#
sub psselect {
    my $IN = $_[0];
    my $OUT;

    my @cmd = ('psselect', '-p', $select);
    push @cmd, '-q' if defined $quiet;

    # open3 is given the input by glob name, hence the alias
    *IN = $IN;
    my $pid = open3 '<&IN', $OUT, '>&LOG', @cmd;
    push @pids, [$pid, @cmd];

    return $OUT;
}


#
# Detect / calculate the bounding box
#
# Takes a filehandle on PostScript data and returns a list made of a
# filehandle from which the same data can be read, followed by the four
# bounding box coordinates.  The box is, by order of preference:
# computed with the Ghostscript `bbox' device (crop option), derived
# from the input width and height given on the command line, or read
# from the first `%%BoundingBox' comment of the input.
#
sub psbbox {
    my $IN = $_[0];
    my ($OUT, @bbox);

    if (defined $crop) {
        # Calculate the maximal bounding box

        unless (seek $IN, 0, 1) {
            # The input is not seekable: have to create a seekable auxiliary
            # file
            my $auxfile = "$tmpdir/pdftool-stdin-$$.ps";
            open my $AUX, '>', $auxfile
                or die "Can't write into `$auxfile': $!";
            push @auxfiles, $auxfile;

            # cat > $auxfile
            while (<$IN>) {
                print {$AUX} $_ or die "Can't print: $!";
            }
            close $AUX or die "Can't close: $!";
            close $IN or die "Can't close: $!";
            open $IN, '<', $auxfile or die "Can't read `$auxfile': $!";
        }

        # Need to duplicate IN, since it will be closed in the parent process
        open *IN, '<&=', $IN or die "Can't fdopen: $!";
        my @cmd = ('gs', '-sDEVICE=bbox', '-dBATCH', '-dNOPAUSE', '-');
        # The third handle is the one the parent reads: the loop below
        # takes the bounding boxes from OUT
        my $pid = open3 "<&IN", ">&OUT", *OUT, @cmd;

        my ($p, $c) = (0, 0); # Page & character counter
        my ($x0, $y0, $x1, $y1) = (1<<16, 1<<16, -(1<<16), -(1<<16));
        while (<OUT>) {
            if ($_ =~ m/^\%\%BoundingBox: (\d+) (\d+) (\d+) (\d+)/) {
                # Keep the smallest box that contains the box of every page
                $x0 = $1 if $1 < $x0;
                $y0 = $2 if $2 < $y0;
                $x1 = $3 if $3 > $x1;
                $y1 = $4 if $4 > $y1;
                unless (defined $quiet) {
                    # Progress report: "[1] [2] ...", wrapped at 80 characters
                    my $s = "[" . ++$p . "] ";
                    $c += length $s;
                    if ($c >= 80) {
                        print LOG "\n" or die "Can't print: $!";
                        $c = length $s;
                    }
                    print LOG $s or die "Can't print: $!";
                }
            }
        }
        close OUT or die "Can't close: $!";
        print LOG "\n" or die "Can't print: $!" unless defined $quiet;

        # No zombie processes
        waitpid $pid, 0;
        die "Can't run `" . printcmd(@cmd) . "'" if $? >> 8;
        die "Error while calculating bounding box"
            if ($x0 >= $x1 || $y0 >= $y1);
        @bbox = ($x0, $y0, $x1, $y1);

        # Let's go back to the beginning of the input
        seek ($IN, 0, 0) or die "Can't seek: $!";
        $OUT = $IN;
    }

    elsif (defined $inwidth and defined $inheight) {
        @bbox = (0, 0, $inwidth, $inheight);
        $OUT = $IN;
    }

    else {
        # Guess page size from the input file

        # To avoid seeking into IN, its content is copied from WRITE to
        # READ in background, once the Bounding Box has been read
        my ($READ, $WRITE);
        pipe $READ, $WRITE or die "Can't pipe: $!";
        while (!@bbox && defined (my $line = <$IN>)) {
            print {$WRITE} $line or die "Can't print: $!";
            @bbox = ($1, $2, $3, $4)
                if ($line =~ m/^\%\%BoundingBox: (\d+) (\d+) (\d+) (\d+)/);
        }
        die "Cannot guess input page size" unless @bbox;

        unless (my $pid = fork) {
            # Child: cat IN > WRITE in background
            die "Can't fork: $!" unless defined $pid;
            close $READ or die "Can't close: $!";
            while (<$IN>) {
                print {$WRITE} $_ or die "Can't print: $!";
            }
            exit;
        }

        # Parent
        close $WRITE or die "Can't close: $!";
        $OUT = $READ;
    }

    return ($OUT, @bbox);
}


#
# PSBook
#
# Feeds the given filehandle to `psbook' and returns the filehandle
# connected to its standard output.
#
sub psbook {
    my $IN = $_[0];
    my $OUT;

    my @cmd = ('psbook');
    push @cmd, '-q' if defined $quiet;

    *IN = $IN;
    my $pid = open3 "<&IN", $OUT, ">&LOG", @cmd;
    push @pids, [$pid, @cmd];

    return $OUT;
}


#
# PSNup (inlined here, to keep track of the possible rotation)
#
# Takes a filehandle on PostScript data and the input bounding box.
# Chooses the grid of $nup logical pages per sheet, builds the matching
# `pstops' page specification and runs `pstops'.  Returns the filehandle
# connected to its standard output, the landscape flag (true when the
# global output width and height have been swapped) and the rotation
# (0 or 3) that was applied to the logical pages.
#
sub psnup {
    my ($IN, @bbox) = @_;
    my ($OUT, $landscape, $rotate);

    my $in_landscape  = ($bbox[2]-$bbox[0] > $bbox[3]-$bbox[1]);
    my $out_landscape = ($outwidth-2*$margin > $outheight-2*$margin);
    if (($in_landscape and not $out_landscape)
        or (defined $screen and not $in_landscape and $out_landscape)) {
        # NB: this swaps the global output size, which pstopdf() reads later
        ($outheight, $outwidth) = ($outwidth, $outheight);
        $landscape = 1;
    }

    #
    # Finding the best layout is an optimisation problem. We try all of the
    # combinations of width*height in both normal and rotated form, and
    # minimise the wasted space.
    #

    my ($ow, $oh) = ($outwidth-2*$margin, $outheight-2*$margin);
    my ($iw, $ih) = ($bbox[2]-$bbox[0], $bbox[3]-$bbox[1]);
    my ($horiz, $vert, $scale, $hshift, $vshift);

    my $tolerance = 100000; # layout tolerance
    my $best = $tolerance;
    for (my $hor = 1; $hor; $hor = nextdiv($hor, $nup)) {
        my $ver = $nup / $hor;

        # try normal orientation first
        my $scl = min($oh/($ih*$ver), $ow/($iw*$hor));
        my $wasted_w = $ow-$scl*$iw*$hor;
        my $wasted_h = $oh-$scl*$ih*$ver;
        my $optim = $wasted_w * $wasted_w + $wasted_h * $wasted_h;
        if ($optim < $best) {
            $best = $optim;
            # recalculate scale to allow for internal borders
            $scale = min(($oh-2*$border*$ver)/($ih*$ver),
                         ($ow-2*$border*$hor)/($iw*$hor));
            $hshift = ($ow/$hor - ($bbox[2]+$bbox[0])*$scale)/2;
            $vshift = ($oh/$ver - ($bbox[3]+$bbox[1])*$scale)/2;
            ($horiz, $vert) = ($hor, $ver);
            $rotate = 0;
        }

        # try rotated orientation
        $scl = min($oh/($iw*$hor), $ow/($ih*$ver));
        $wasted_h = $oh-$scl*$iw*$hor;
        $wasted_w = $ow-$scl*$ih*$ver;
        $optim = $wasted_h * $wasted_h + $wasted_w * $wasted_w;
        if ($optim < $best) {
            $best = $optim;
            # recalculate scale to allow for internal borders
            $scale = min(($oh-2*$border*$hor)/($iw*$hor),
                         ($ow-2*$border*$ver)/($ih*$ver));
            $hshift = ($ow/$ver - ($bbox[3]+$bbox[1])*$scale)/2;
            $vshift = ($oh/$hor - ($bbox[2]+$bbox[0])*$scale)/2;
            ($horiz, $vert) = ($ver, $hor);
            $rotate = 3;
        }
    }

    # Fail if nothing better than worst tolerance was found
    die "Can't find acceptable layout for $nup-up" if $best == $tolerance;

    #
    # Construct pstops specification list
    #

    my $n = $horiz * $vert;
    my (@ospecs, @especs); # specs for odd and even pages
    # Single-quoted so that `@' can never be taken for an array
    my $specfmt = '%d%s@%.3f(%.3f,%.3f)';

    for (my $pageno = 0; $pageno < $n; $pageno++) {
        my ($up, $across); # pageno index
        my ($orot, $erot) = ('', 'U');
        my ($xoff, $yoff);

        if ($rotate) {
            if (defined $column) {
                # column=0; leftright=1; topbottom=0;
                $across = $pageno % $horiz;
                $up = floor ($pageno / $horiz);
            }
            else {
                # column=1; leftright=1; topbottom=0;
                $across = floor ($pageno / $vert);
                $up = $pageno % $vert;
            }
            ($orot, $erot) = ('L', 'R');
            $xoff = ($across+1)*$ow/$horiz - $hshift;
        }
        else {
            if (defined $column) {
                # column=1; leftright=1; topbottom=1;
                $across = floor ($pageno / $vert);
                $up = $vert-1 - floor ($pageno % $vert);
            }
            else {
                # column=0; leftright=1; topbottom=1;
                $across = $pageno % $horiz;
                $up = $vert-1 - floor ($pageno / $horiz);
            }
            $xoff = $across*$ow/$horiz + $hshift;
        }
        $yoff = $up*$oh/$vert + $vshift;

        push @ospecs, sprintf ($specfmt, $pageno, $orot, $scale,
                               $xoff+$margin, $yoff+$margin);
        push @especs, sprintf ($specfmt, $n + $pageno, $erot, $scale,
                               $ow-$xoff+$margin, $oh-$yoff+$margin);
    }

    ($ow, $oh) = ($oh, $ow) if $rotate%2;
    my $pagespecs;
    if (defined $screen || $ow < $oh) {
        $pagespecs = $nup . ':' . join ('+', @ospecs);
    }
    else {
        # Two sheets per cycle: the second one uses the `even' specs
        $pagespecs = 2*$nup . ':' . join ('+', @ospecs)
                            . ',' . join ('+', @especs);
    }

    my @cmd = ('pstops', '-w', $bbox[2], '-h', $bbox[3], $pagespecs);
    push @cmd, '-q' if defined $quiet;

    *IN = $IN;
    my $pid = open3 "<&IN", $OUT, ">&LOG", @cmd;
    push @pids, [$pid, @cmd];

    return ($OUT, $landscape, $rotate);
}


#
# Final file: Convert back to PDF
#
# Runs Ghostscript with the `pdfwrite' device, reading PostScript from
# $IN and writing the PDF to $OUT.  $landscape and $rotate are the
# values returned by psnup(); they decide whether an /Orientation entry
# is added to the page device.
#
sub pstopdf {
    my ($IN, $OUT, $landscape, $rotate) = @_;

    my ($ow, $oh) = ($outwidth, $outheight);
    ($ow, $oh) = ($oh, $ow) if $rotate%2;

    my $pagedevice;
    # NOTE(review): this condition used to contain the extra alternative
    # `$oh < $oh', which is always false.  It looks like a typo for a
    # comparison between $ow and $oh, but the intended direction cannot be
    # told from the code, so the effective behaviour is kept unchanged.
    if (defined $screen || $landscape) {
        $rotate = ($rotate+1)%4 if !defined $screen and $oh < $ow;
        $pagedevice = "/Orientation $rotate /PageSize [$outwidth $outheight]";
    }
    else {
        $pagedevice = "/PageSize [$outwidth $outheight]";
    }

    my @cmd = ('gs', "-sDEVICE=pdfwrite", "-sOutputFile=%stdout%",
                     "-dBATCH", "-dNOPAUSE", "-dAutoRotatePages=/None",
                     "-c", "<< $pagedevice >> setpagedevice",
                     "-f", "-");

    (*IN, *OUT) = ($IN, $OUT);
    my $pid = open3 "<&IN", ">&OUT", ">&LOG", @cmd;
    push @pids, [$pid, @cmd];
}


#
# In-place convert the given length to PostScript points
#
# Takes a reference to a scalar holding a number optionally followed by
# one of the units `pt', `in', `cm' or `mm' (no unit means `pt'), and
# replaces it with the rounded number of points.  A reference to an
# undefined value is left untouched.  Dies on unparsable input or on an
# unknown unit.
#
sub topoints {
    my $length = $_[0];
    return unless defined $$length;

    my ($value, $unit) = $$length =~ /^([+-]?\d*\.?\d+)(\w*)$/
        or die "Unable to parse `$$length'";

    # Number of PostScript points per unit
    my %points_per = (
        ''   => 1,
        'pt' => 1,
        'in' => 72,
        'cm' => 72/2.54,
        'mm' => 72/25.4,
    );
    die "Unknown unit: `$unit'" unless exists $points_per{$unit};

    # Round to the nearest integer
    $$length = floor ($value * $points_per{$unit} + .5);
}


#
# In-place set the given width and height to the predefined papersize
#
# Takes a paper name (case-insensitive) and two scalar references, which
# receive the width and the height.  Sizes are in PostScript points,
# except for `10x14' which is stored with an explicit unit.  Dies if the
# paper name is unknown.
#
sub papersize {
    my ($p, $w, $h) = @_;
    $p = lc $p;

    # The table lives inside the sub because papersize() is called while
    # the options are parsed, before any file-level initialisation placed
    # down here would have run.
    my %size_of = (
        "a0"        => [2384, 3370],     # 84.1cm * 118.9cm
        "a1"        => [1684, 2384],     # 59.4cm * 84.1cm
        "a2"        => [1191, 1684],     # 42cm * 59.4cm
        "a3"        => [842,  1191],     # 29.7cm * 42cm
        "a4"        => [595,  842],      # 21cm * 29.7cm
        "a5"        => [421,  595],      # 14.85cm * 21cm
        "b5"        => [516,  729],      # 18.2cm * 25.72cm
        "letter"    => [612,  792],      # 8.5in * 11in
        "legal"     => [612,  1008],     # 8.5in * 14in
        "ledger"    => [1224, 792],      # 17in * 11in
        "tabloid"   => [792,  1224],     # 11in * 17in
        "statement" => [396,  612],      # 5.5in * 8.5in
        "executive" => [540,  720],      # 7.5in * 10in
        "folio"     => [612,  936],      # 8.5in * 13in
        "quarto"    => [610,  780],      # 8.5in * 10.83in
        "10x14"     => ["10in", "14in"],
    );

    die "Unknown paper size: `$p'" unless exists $size_of{$p};
    ($$w, $$h) = @{ $size_of{$p} };
}


#
# Print a command just like you'd do in a shell
#
# Returns the arguments joined with spaces.  Every double quote is
# backslash-escaped, and an argument containing a blank or one of the
# listed shell metacharacters is wrapped in double quotes.  The result
# is only used in diagnostic messages.
#
sub printcmd {
    my @cmd;
    for my $arg (@_) {
        my $s = $arg;
        # /g: escape every double quote, not only the first one
        $s =~ s/"/\\"/g;
        $s = "\"$s\"" if $s =~ /[ ()';#{}*?~&|`!]/;
        push @cmd, $s;
    }
    join ' ', @cmd;
}


#
# Return the smallest divisor of $m that is strictly greater than $n,
# or 0 if there is none
#
sub nextdiv {
    my ($n, $m) = @_;
    while (++$n <= $m) {
        return $n if ($m % $n == 0)
    }
    return 0;
}


#
# Return the smaller of two numbers
#
sub min {
    my ($n, $m) = @_;
    return $n if $n < $m;
    return $m;
}