#!/usr/bin/perl

# PDFΥ֥ȤΤƱƤΤΤޤȤġ
# pdftgifθ˻Ѥ뤳Ȥ
# Usage: thiscommand [PDFfile [Outfile]] (ե̾夨ơ-פǤ)
# ɬפʤ: PerlDigest::MD5⥸塼

# : ƱƤΥ֥ȤʣСؤݥ󥿤
# Τ1Ĥؤݥ󥿤˽񤭴롣θpdftk̤κݤ˰̤
# ꤹ뤳ȤˤԻѤΥ֥Ȥ

# ֥Ȼȥơ֥+ȥ졼鼭ȤָŤ餢빽¤PDF
# ƤꡢǤʤ缫ưŪ˰öpdftk̤ȤǤι¤ˤʤ뤳Ȥ
# ξ硢Ǹνȹ碌2pdftk̤Ȥˤʤ
# ޤξ硢礭PDFˤʤ뤳Ȥ⤢

# ¸߰յ: pdftgifϸPDFڡˤФ餷ơùƺƷ礹߷פǤ
# ΤᡢPDF˴ޤޤե(ʤ)ΥǡФ餵줿ڡ
# ޤޤ졢Ʒ礹̡PDFˤϥڡ֤Ʊǡ
# ޤޤ뤳ȤˤʤꡢPDF粽Ȥˤʤ
# ڡФ餵˽߷פˤФʤʤäΤpdftgif
# ߷פΤ
# ǡpdftgifνPDF򤳤ΥġǽƱեȥǡ(ʤ)
# ٤ޤޤʤ褦ˤ褦Ȥ

use File::Temp 'tempfile';
eval 'require Digest::SHA; sub new_dgst{Digest::SHA -> new(256)}' ||
eval 'require Digest::MD5; sub new_dgst{Digest::MD5 -> new}' ||
	die "Neither Digest::SHA nor Digest::MD5 is installed\n";

@ARGV <= 2 || die "Usage: $0 [PDFfile [Outfile]]\n";
($fnm, $ofnm) = (@ARGV, "-", "-");

$fnm eq '-' ? open($inf, '<&', STDIN) :
	open($inf, '<', $fnm) || die "$fnm: $!\n";
# $infseekǽˤ
unless(seek($inf, 0, 1)){
	my($tmpf, $s);
	$tmpf = tempfile() || die "$!\n";
	print $tmpf $s || die "$!\n" while read $inf, $s, 4096;
	close $inf;
	$inf = $tmpf;
}
open $orgf, '<&', $inf;

sub getc{ # 1ʸundef֤
	my($inf, $c) = $_[0];
	read($inf, $c, 1) ? $c : undef;
}
sub pass_spc{ # ʸޤǥեݥ󥿤ư ֤ʤ
	my($inf, $c) = $_[0];
	OUTER: {
		# $cEOFˤʤޤ
		1 while ($c = getc($inf)) =~ /^\s$/a;
		return unless defined $c;
		if($c eq '%'){ # ɬ?
			# $cԤEOFˤʤޤ
			1 while ($c = getc($inf)) =~ /^[^\r\n]$/;
			return unless defined $c;
			redo OUTER;
		}
	}
	seek($inf, -1, 1);
}
sub getword{ # ФƼڤʸΤޤ(Τʤ)֤
	my($inf, $c, $ret) = $_[0];
	pass_spc($inf);
	# $c%EOFˤʤޤ(%θɬ?)
	$ret .= $c while ($c = getc($inf)) =~ /^(?![][()<>{}\/%])[!-~]$/a;
	seek $inf, -1, 1 if defined $c; # %θʤ餳ɬ
	# print STDERR "[[$ret]]";
	$ret;
}
sub getxref{ # xrefפθμޤǥեݥ󥿤ư
	# $xrefposxrefƬ֤򥰥Х˥å
	my($inf, $s, $size, $pos) = $_[0];
	seek $inf, 0, 2 || die "$!\n"; # $infκǸ
	$size = tell $inf;
	seek $inf, -($size > 1024 ? 1024 : $size), 2;
	read $inf, $s, 1024;
	$s =~ /\sstartxref\s+(\d+)(?:\s+%%EOF)+\s*$/a || unexpected(0);
	# %%EOFʣPDF
	seek $inf, $pos = $1, 0;
	getword($inf) eq 'xref' ? ($xrefpos = $pos): undef;
}

{
	my $already;

	sub unexpected{ # PDF(pdftkǽƤ)Ԥ¤ˤʤäƤʤ
		die sprintf "$fnm: %s ($_[0])\n", $already ?
			"pdftk failed or did not generate expected result" :
			"this PDF did not have expected structure";
	}

	sub repair{ # ǹ1PDFpdftkǽľ$inf򥻥åȤľ
		unexpected(1) if $already++;
		my($inf, $pid, $tmpf) = $_[0];
		($tmpf = tempfile()) && defined($pid = fork) || die "$!\n";
		unless($pid){ # child
			seek $inf, 0, 0; open STDIN, '<&', $inf;
			open STDOUT, '>&', $tmpf;
			close $inf; close $tmpf;
	 		exec 'pdftk', '-', 'output', '-';
		} else {
			close $inf;
			wait; die "pdftk abend\n" if $?;
		}
		$tmpf;
	}
}

OUTER: {
	unless(getxref($inf)){
		# startxrefΰ֤xrefĤʤ(PDF1.6ʤ)
		# öpdftk̤
		$inf = repair($inf); redo OUTER;
	}

	@objtbl = (); @gentbl = ();
	while(1){
		my($s, $c, $p, $g, $f);
		# xrefμΡֹ֥֥ Ŀפɤ
		$s = getword($inf);
		$s eq 'trailer' ? last : unexpected(2) if $s !~ /^\d+$/a;
		$c = getword($inf); unexpected(3) if $c !~ /^\d+$/a;
		$s += 0; $c += 0;

		if(!$c){
			# xrefˡָĿ0פΥȥ꤬öpdftk̤
			$inf = repair($inf); redo OUTER;
		}
		# print STDERR "<<$s|$c>>\n";

		# ֥Ȥλȥȥɤ
		while($c--){
			$p = getword($inf); $g = getword($inf);
			$f = getword($inf);
			unexpected(4) if $p !~ /^\d{10}$/a || $g !~ /^\d{5}$/a
				|| $f !~ /^[nf]$/;
			if($f ne 'n'){
			# ֹ0̤ѥȥ꤬öpdftk̤
				if($s){
					$inf = repair($inf); redo OUTER;
				}
			} else {
			# nʤΤ˻褬0Ǥ
			# xrefΰ֤Ǥöpdftk̤
				if($p == 0 || $xrefpos < $p){
					$inf = repair($inf); redo OUTER;
				}
				$objtbl[$s] = $p + 0; $gentbl[$s] = $g + 0;
			}
			$s++;
		}
	}
}
# λ @objtbl: ֹ֥椫ХȰ֤ؤ
# @gentbl: ֹ֥椫ֹؤ
for(my $i = 1; $i < @objtbl; $i++){$robjtbl{$objtbl[$i]} = $i} # @objtblε

@objpos = (sort {$a <=> $b} grep {$_ ne ''} @objtbl, $xrefpos);
# λ @objpos: ֥ȤγϥХȰ֤ξΰ
# ϥֹ֥Ȥбʤ ޤǸxrefγϰ֤ɲäƤ

# print STDERR "@objtbl\n";
# for $i(sort{$a<=>$b}keys %robjtbl){print STDERR "$i-$robjtbl{$i} "}
#	print STDERR "\n";
# print STDERR "@objpos\n";

my $has_dupobj;
for(my $i = 0; $i < @objpos - 1; $i++){
	# ƥ֥Ȥ(ֹ  objפ塢endobjޤǤޤ)
	# Υϥå
	my($n1, $n2, $o);
	seek $inf, $objpos[$i], 0;
	# print STDERR "$i,$objpos[$i],$n1,$n2,$o," .
	#	"$robjtbl{$objpos[$i]},$gentbl[$n1]\n";  
	$n1 = getword($inf); $n2 = getword($inf); $o = getword($inf);
	unexpected(5) if $n1 !~ /^\d+$/a || $n2 !~ /^\d+$/a || $o ne 'obj' ||
		$n1 != $robjtbl{$objpos[$i]} || $n2 != $gentbl[$n1];

	pass_spc($inf);
	my($amnt, $s, $ss, $sss) = $objpos[$i+1] - tell $inf;

	# next if $amnt <= 200; # ʤоݤˤʤ

	my $ctx = new_dgst();
	while($amnt > 0){
		my $cnt = ($amnt > 4096 ? 4096 : $amnt);
		read $inf, $s, $cnt; $amnt -= $cnt;
		$s =~ s/(\s*$)//; $ss = $1;
		if($s eq ''){
			$sss .= $ss;
		} else {
			$ctx -> add($sss . $s); $sss = $ss;
		} # endobjޤǤޤޤ
	}

	my $d = $ctx -> digest;
#	my $d = $ctx -> hexdigest;
	$obj_digest[$n1] = $d;
	my $ng = "$n1 $gentbl[$n1]";
	$dgst_objgen{$d} = $ng if !defined $dgst_objgen{$d} ||
		($has_dupobj = 1, length($dgst_objgen{$d}) > length($ng));
	# print STDERR "$n1|$obj_digest[$n1]\n";
} # $dgst_objgen{$i}: Ƥϥå$iġֹ֥֥ ֹפκû

# for my $i (keys %dgst_objgen){print STDERR "$i $dgst_objgen{$i}\n"}
# print STDERR $has_dupobj+0, "\n";

open(STDOUT, '>', $ofnm) || die "$ofnm: $!\n" if $ofnm ne '-';

# ƱƤʣΥ֥Ȥ¸ߤ$has_dupobj1ˤʤäƤ
# Ǥʤ縵ե򤽤Τޤ޽Ϥƽλ
# ʤƱƤɤϥϥåΰפΤߤȽ(ϥåξͤϹθƤʤ)
unless($has_dupobj){
	seek $orgf, 0, 0;
	my $s;
	print $s || die "$!\n" while read $orgf, $s, 4096;
	exit;
}
close $orgf;

# Ϥpdftk compressǽ
open $otf, '|-', 'pdftk', '-', 'output', '-', 'compress' || die "$!\n";

seek $inf, 0, 0;
my $s;
# եκǽΥ֥ȤľޤǤΤޤ޽
read $inf, $s, $objpos[0];
print $otf $s;

# եγƥ֥ȤեƬɤ
for(my $i = 0; $i < @objpos - 1; $i++){
	my($amnt, $cnt, $s, $anterior) = $objpos[$i+1] - tell $inf;
	$cnt = ($amnt > 16384 ? 16384 : $amnt);
	read $inf, $s, $cnt; $amnt -= $cnt;

	# ΤؤβϤŬ Ƭ16kХȤƤʤ ̾ʤɤ
	# ƹǤⴰˤϲϤƤʤ  streamܤ˲Ϥ
	# ʤΤ streamȯǲϤλ
	# ϤԴ 񤭴ƤϤʤȤäŦФϤƤʤϤ
	while($s =~ s/^(
		\s+
		| %[^\r\n]*[\r\n]
		| \d+\s+\d+\s+R\b
		| [-+]?(?:\d+(?:\.\d*)?|\.\d+)
		| (?:obj|true|false|null)\b
		| \/(?:(?![][()<>{}\/%\#])[!-~]|\#[\dA-Fa-f]{2})*
		| << | >> | \[ | ]
		| \((?:[^)\\]|\\[ -~])*\) | <[\dA-Fa-f]*>
	)(?=.)//xsa){
		my $ss = $1;
		# ֥ȤؤλȤ򸫤Ĥ %dgst_objgen򸫤ƽ񤭴
		if($ss =~ /^(\d+)\s+(\d+)\s+R\z/ && defined $obj_digest[$1]){
			my $ng = $dgst_objgen{$obj_digest[$1]};
			unexpected(6) if $2 != $gentbl[$1] || !defined $ng;
			$ss = sprintf("%-*s R", length($ss) - 2, $ng);
		}
		$anterior .= $ss;
	}
	print $otf $anterior, $s;
	# ϤʤäĤʬ򤽤Τޤ޽
	while($amnt > 0){
		my $cnt = ($amnt > 4096 ? 4096 : $amnt);
		read $inf, $s, $cnt; $amnt -= $cnt;
		print $otf $s;
	}
}

# ǸΥ֥Ȥʬ򤽤Τޤ޽
print $otf $s || die "$!\n" while read $inf, $s, 4096;
# closepdftkؤpipeĤ뤫
close $otf || die "$!\n";
