From c205ca2d1573cab9fb0013c3c186ad18917a80a0 Mon Sep 17 00:00:00 2001 From: Guilhem Moulin Date: Mon, 23 Jan 2012 20:34:31 +0100 Subject: PageBoundingBox; better handling of seekable inputs --- pdftool.pl | 154 +++++++++++++++++++++++++++++++++---------------------------- 1 file changed, 84 insertions(+), 70 deletions(-) (limited to 'pdftool.pl') diff --git a/pdftool.pl b/pdftool.pl index bbf3827..97c8d61 100755 --- a/pdftool.pl +++ b/pdftool.pl @@ -42,9 +42,9 @@ If no input file is given, or if a single hyphen-minus (I<->) is given as file name, B reads the PDF or PostScript data from the standard input. If a PDF is sent to the standard input, an auxiliary file is created (because reading a PDF requires random access to the data), and removed -afterwards. Also, if the option B<-c> (cropping) is set while the input is -not a regular seekable file, an auxiliary file is created, and removed -afterwards. +afterwards. Also, if the input size is not specified (options B<-W> and B<-H>, +or B<-P>) for a non seekable input file, an auxiliary file is created, +and removed afterwards. The input page size is by default guessed from the input document. However, the options B<-P>, B<-W> and B<-H> let you choose a specific input page @@ -62,8 +62,8 @@ processed (using the set of options given immediately before each argument) and then merged. A global set of options can be passed after the last input file: they will be applied to all the inputs (the local taking precedence, in case of clashes). -NOTE: The options B<-p>, B<-w>, B<-h>, B<--auto-rotate> and B<--ps> -refer to the output file only, and should be set only once. +NOTE: The options B<-p>, B<-w>, B<-h>, B<--auto-rotate>, B<--ps> and +B<-q> refer to the output file only, hence should be set only once. 
B<pdftool.pl> does the following passes on the input documents: @@ -236,8 +236,7 @@ C<< ssh remote pdftool.pl -cpA4 --book -2 -b2cm -m-1cm --auto-rotate < in.pdf | =item * Requires C<pdftops> available via the command line (only if the input is -a PDF). Depending on your own version of this program, you might need to -hack the source yourself to remove the C<-origpagesizes> option :-/. +a PDF). =item * @@ -298,7 +297,7 @@ while (@ARGV) { "8" => sub { $config{nup} = 8 }, "9" => sub { $config{nup} = 9 }, "column" => sub { $config{column} = 1 }, - "q|quiet" => sub { $config{quiet} = 1}, + "q|quiet" => sub { $out{quiet} = 1}, "man" => sub { pod2usage(-exitstatus => 0, -verbose => 2) } ) or pod2usage(2); @@ -478,15 +477,22 @@ sub pdftops { # # Detect filetype, using input file's magic number # - # To avoid to seek into IN, it gonna be copied from WRITE to READ in - # the background, once the filetype has been read + # To avoid to seek into IN, the input will be copied from WRITE to + # READ in the background (if it's not seekable), + # once the magic number has been read + + my $seek = seek $IN, 0, 1; my $filetype; my ($READ, $WRITE); - pipe $READ, $WRITE or die "Cannot pipe: $!"; + unless ($seek) { + pipe $READ, $WRITE or die "Cannot pipe: $!"; + } $_ = <$IN>; - print $WRITE ($_) or die "Cannot print: $!"; + unless ($seek) { + print $WRITE ($_) or die "Cannot print: $!"; + } if (defined $_ && $_ =~ /^%!/) { $filetype = "PS"; } elsif (defined $_ && $_ =~ /^%PDF/) { @@ -496,22 +502,28 @@ sub pdftops { "' has an unknown magic number.\n"; } - unless (my $pid = fork) { - # Child: cat $IN > $WRITE in background - die "Cannot fork: $!" unless defined $pid; - close $READ or die "Cannot close: $!"; - - while (<$IN>) { - print $WRITE ($_) or die "Cannot print: $!"; - } - exit; + if ($seek) { + seek $IN, 0, 0 or die "Cannot seek: $!"; } + else { + unless (my $pid = fork) { + # Child: cat $IN > $WRITE in background + die "Cannot fork: $!" 
unless defined $pid; + close $READ or die "Cannot close: $!"; + + while (<$IN>) { + print $WRITE ($_) or die "Cannot print: $!"; + } + exit; + } + # Parent + close $WRITE or die "Cannot close: $!"; - # Parent - close $WRITE or die "Cannot close: $!"; - + $IN = $READ; + } - return $READ if $filetype eq "PS"; + + return $IN if $filetype eq "PS"; # @@ -531,7 +543,7 @@ sub pdftops { or die "Error: Cannot write into `" .$infile. "': $!\n"; # cat > $infile - while (<$READ>) { + while (<$IN>) { print $AUX ($_) or die "Cannot print: $!"; } close $AUX; @@ -576,10 +588,10 @@ sub pdftops { # Convert to PS # TODO: use gs & ps2write, more portable - my @cmd = ('pdftops', '-passfonts', '-level3', '-origpagesizes'); + my @cmd = ('pdftops', '-passfonts', '-level3'); push @cmd, '-f', $first if defined $first; push @cmd, '-l', $last if defined $last; - push @cmd, '-q' if exists $config->{quiet}; + push @cmd, '-q' if exists $out{quiet}; push @cmd, '--', $infile, '-'; my $pid = open $OUT, '-|', @cmd; @@ -598,7 +610,7 @@ sub psselect { my $OUT; my @cmd = ('psselect', '-p'. $config->{select}); - push @cmd, '-q' if exists $config->{quiet}; + push @cmd, '-q' if exists $out{quiet}; *IN = $IN; my $pid = open3 '<&IN', $OUT, '>&LOG', @cmd; @@ -614,8 +626,8 @@ sub psselect { # sub psbbox { my ($IN, $config) = @_; - my ($OUT, @bbox); - + my @bbox; + if (exists $config->{crop}) { # Calculate the maximal bounding box @@ -625,35 +637,34 @@ sub psbbox { my $auxfile = &mktemp( "pdftool-in$config->{index}-$$.ps" ); - open my $AUX, '>', $auxfile + open my $AUX, '+>', $auxfile or die "Cannot write into `" .$auxfile. "': $!\n"; # cat > $auxfile while (<$IN>) { print $AUX ($_) or die "Cannot print: $!"; } - close $AUX or die "Cannot close: $!"; close $IN or die "Cannot close: $!"; - - open $IN, '<', $auxfile or die "Cannot read `" .$auxfile. 
"': $!\n"; + $IN = $AUX; } + seek $IN, 0, 0 or die "Cannot seek: $!"; # Need to duplicate IN, since it will be closed in the parent process open *IN, '<&=', $IN or die "Cannot fdopen: $!"; my @cmd = (@gs, '-sDEVICE=bbox', '-dQUIET', '-dBATCH', '-dNOPAUSE', - '-f', '-'); + '-sOutputFile=%stdout%', '-f', '-'); my $pid = open3 '<&IN', '>&OUT', *OUT, @cmd; my ($p,$c) = (0,0); # Page & character counter my ($x0, $y0, $x1, $y1) = (1<<16, 1<<16, -(1<<16), -(1<<16)); while () { - if ($_ =~ m/^\%\%BoundingBox: (\d+) (\d+) (\d+) (\d+)/) { + if ($_ =~ /^\%\%BoundingBox: (\d+) (\d+) (\d+) (\d+)/) { $x0 = $1 if $1 < $x0; $y0 = $2 if $2 < $y0; $x1 = $3 if $3 > $x1; $y1 = $4 if $4 > $y1; - unless (exists $config->{quiet}) { + unless (exists $out{quiet}) { my $s = "[" . ++$p . "] "; $c += length $s; if ($c >= 80) { @@ -665,7 +676,7 @@ sub psbbox { } } close OUT or die "Cannot close: $!"; - print LOG "\n" or die "Cannot print: $!" unless exists $config->{quiet}; + print LOG "\n" or die "Cannot print: $!" unless exists $out{quiet}; # No zombie processes waitpid $pid, 0; @@ -677,47 +688,50 @@ sub psbbox { # Let's go back to the beginning of the input seek $IN, 0, 0 or die "Cannot seek: $!"; - $OUT = $IN; } elsif (exists $config->{inwidth} and exists $config->{inheight}) { @bbox = (0, 0, $config->{inwidth}, $config->{inheight}); - $OUT = $IN; } else { - # Guess page size from the input file + # Guess page size from the input file. 
If it isn't seekable, we + # have to create an auxiliary file (we have to go through all + # the file to calculate the proper bounding box); - # To avoid to seek into IN, it gonna be copied from WRITE to READ - # in background, once the Bounding Box has been read - my ($READ, $WRITE); - pipe $READ, $WRITE or die "Cannot pipe: $!"; - - # TODO: consider PageBoundingBox (and take the biggest) - while (not (@bbox) && defined (my $l = <$IN>)) { - print $WRITE ($l) or die "Cannot print: $!"; - @bbox = ($1, $2, $3, $4) - if ($l =~ m/^\%\%BoundingBox: (\d+) (\d+) (\d+) (\d+)/); + my $seek = seek $IN, 0, 1; + my $AUX; + unless ($seek) { + my $auxfile = &mktemp( "pdftool-in$config->{index}-$$.ps" ); + open $AUX, '+>', $auxfile + or die "Cannot write into `" .$auxfile. "': $!\n"; } - die "Cannot guess input page size.\n" unless @bbox; - - unless (my $pid = fork) { - # Child: cat IN > WRITE in background - die "Cannot fork: $!" unless defined $pid; - close $READ or die "Cannot close: $!";; - - while (<$IN>) { - print $WRITE ($_) or die "Cannot print: $!"; + my @pbbox = (1<<16,1<<16,-(1<<16),-(1<<16)); + while (<$IN>) { + unless ($seek) { + print $AUX ($_) or die "Cannot print: $!"; + } + chomp; + if ($_ =~ /^\%\%BoundingBox: (\d+) (\d+) (\d+) (\d+)$/) { + @bbox = ($1, $2, $3, $4); + } + elsif ($_ =~ /^\%\%PageBoundingBox: (\d+) (\d+) (\d+) (\d+)$/) { + $pbbox[0] = $1 if $pbbox[0]>$pbbox[2] or $1<$pbbox[0]; + $pbbox[1] = $2 if $pbbox[1]>$pbbox[3] or $2<$pbbox[1]; + $pbbox[2] = $3 if $pbbox[0]>$pbbox[2] or $3>$pbbox[2]; + $pbbox[3] = $4 if $pbbox[1]>$pbbox[3] or $4>$pbbox[3]; } - exit; } - # Parent + @bbox = @pbbox unless $pbbox[0]>$pbbox[2] or $pbbox[1]>$pbbox[3]; + die "Error: Cannot guess input page size.\n" unless @bbox; - close $WRITE or die "Cannot close: $!"; - - $OUT = $READ; + unless ($seek) { + close $IN or die "Cannot close: $!"; + $IN = $AUX; + } + seek $IN, 0, 0 or die "Cannot seek: $!"; } - return ($OUT, \@bbox); + return ($IN, \@bbox); } @@ -730,7 +744,7 @@ sub 
psbook { my $OUT; my @cmd = ('psbook'); - push @cmd, '-q' if exists $config->{quiet}; + push @cmd, '-q' if exists $out{quiet}; *IN = $IN; my $pid = open3 '<&IN', $OUT, '>&LOG', @cmd; @@ -873,7 +887,7 @@ sub psnup { } my @cmd = ('pstops', '-w'. $bbox->[2], '-h'. $bbox->[3], $pagespecs); - push @cmd, '-q' if exists $config->{quiet}; + push @cmd, '-q' if exists $out{quiet}; *IN = $IN; my $pid = open3 '<&IN', $OUT, '>&LOG', @cmd; @@ -1049,7 +1063,7 @@ sub min { # -# Make a temporary file, and remove it afterwards +# Make a temporary file with the given basename, and remove it afterwards # sub mktemp { my $auxfile = catfile( tmpdir(), $_[0] ); -- cgit v1.2.3