#!/bin/sh
# pdf2rdocx/pdf2rpptx: PDF to rasterized docx/pptx
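# Depends on poppler-utils (pdftocairo, pdftoppm), netpbm and python-{docx,pptx}.
# When the -v option is used it additionally depends on pdfinfo, pdftk and
# inkscape (the latter for the conversion to EMF).
# NOTE: python-docx presumably has to be patched to accept EMF images -- confirm.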

# Defaults: rasterization resolution (dpi) and the low-quality flag (t/f).
DPI=300
LQ=f

# The output format follows the name the script was invoked under:
# a name ending in "pptx" selects pptx, anything else selects docx.
CMD=$(basename "$0")
if [ "${CMD%pptx}" != "$CMD" ]; then
	EXT=pptx
else
	EXT=docx
fi

# Print the command-line synopsis to stderr and terminate with status 1.
# Reads the global $CMD for the program name; never returns.
usage(){
	{
		printf '%s\n' "Usage: $CMD [-r DPI] [-l] [-v pages] [PDFfile]"
		printf '%s\n' \
			'  -r: resolution' \
			'  -l: low quality (no antialiasing)' \
			'  -v: pages to be converted to EMF instead of PNG'
	} >&2
	exit 1
}
# Parse the command-line options.  OPTERR=0 is set before calling getopts;
# any option getopts does not accept ends up in the catch-all arm and
# triggers usage.
#   -r DPI    -> $DPI
#   -l        -> $LQ=t
#   -v pages  -> $VPAGES
OPTERR=0
VPAGES=
while getopts r:lv: OPTC
do
	case "$OPTC" in
	l) LQ=t ;;
	r) DPI=$OPTARG ;;
	v) VPAGES=$OPTARG ;;
	*) usage ;;
	esac
done
# Drop the options that were consumed; only operands remain in "$@".
shift $((OPTIND - 1))
# Remember the invocation directory; relative paths are made absolute
# against it because the script later changes directory.
CURWD=$(/bin/pwd) || exit 1

# Resolve the input:
#   one operand  -> a readable file (the ".pdf" suffix may be omitted)
#   no operand   -> stdin, unless stdin is a terminal (then show usage)
#   otherwise    -> usage
if [ $# -eq 1 ]; then
	FROMPDF=$1
	if [ ! -e "$1" ] && [ -e "$1".pdf ]; then
		FROMPDF=$1.pdf
	fi
	if [ ! -r "$FROMPDF" ]; then
		echo "$FROMPDF: cannot read" >&2
		exit 1
	fi
	case "$FROMPDF" in
	/*)	;;
	*)	FROMPDF=$CURWD/$FROMPDF ;;
	esac
elif [ $# -eq 0 ]; then
	if [ -t 0 ]; then
		usage
	fi
	FROMPDF=-
else
	usage
fi
# $FROMPDF is now either "-" (stdin) or the full path of the input file.

# Prefix of the scratch directory: under $HOME when it is writable,
# otherwise under /tmp.
TMPBASE=/tmp/.p2d
if [ -w "$HOME" ]; then
	TMPBASE="$HOME"/.p2d
fi
TMPD=

# Exit/signal handler body.  Expects the status to propagate in $ST.
# Leaves the scratch directory, removes it (only when $TMPD really starts
# with $TMPBASE, i.e. mktemp has succeeded), clears the EXIT trap so the
# final exit does not re-enter the handler, and exits with $ST.
p2d_cleanup(){
	cd /tmp
	case "$TMPD" in
	"$TMPBASE"*)	rm -rf "$TMPD";;
	esac
	trap 0
	exit $ST
}
trap 'ST=$?; p2d_cleanup' 0 1 2 3 13 15

TMPD=$(mktemp -d "$TMPBASE"XXXXXX) || exit 1
# $TMPD: name of the scratch directory; $TMPBASE: the prefix it was made from.

# Decide where the generated docx/pptx goes.  When stdout is a terminal the
# document is written to a file: stdin.$EXT in the invocation directory when
# reading stdin, otherwise the input path with a trailing ".pdf" (any case)
# replaced by .$EXT.  When stdout is not a terminal, $TODOCX stays empty and
# the document is streamed to stdout at the end of the script.
if [ -t 1 ]; then
	case "$FROMPDF" in
	-)	TODOCX="$CURWD/stdin.$EXT";;
	*)	TODOCX="$(perl -e '
			$_ = $ARGV[0]; s/\.pdf$//i; print
		' "$FROMPDF").$EXT";;
	esac
	if [ -e "$TODOCX" ]; then	# ask before overwriting
		while :; do
			# printf instead of echo -n / "\c": portable, and the file
			# name is printed literally even if it contains backslashes.
			printf '%s' "\`$TODOCX' exists; overwrite? (y/n) "
			# Take the answer from stdin when it is a terminal, otherwise
			# from /dev/tty.  A failed read (EOF, or /dev/tty cannot be
			# opened) aborts; previously the loop would spin forever.
			if [ -t 0 ]; then
				read -r DUMMY
			else
				read -r DUMMY < /dev/tty
			fi || exit 1
			case "$DUMMY" in [nN]*) exit;; [yY]*) break;; esac
		done
	fi
else
	TODOCX=
fi # $TODOCX: full path of the output file, or empty (write to stdout)

# From here on all work happens inside the scratch directory.
cd "$TMPD" || exit 1
# Base name shared by every intermediate file (a.pdf, a-1.png, a.docx, ...).
TMPPDFBASE=a
case "$FROMPDF" in
-)	:;;
*)	# Expose the input under a fixed name; stop right away if the link
	# cannot be created instead of failing later with a confusing error.
	ln -s -- "$FROMPDF" "$TMPPDFBASE".pdf || exit 1
	FROMPDF="$TMPPDFBASE".pdf;;
esac
# $FROMPDF is now "-" or the symlink a.pdf inside the scratch directory.

# Split the document into pages to be embedded as EMF ($VPAGES) and pages
# to be rasterized as PNG ($RPAGES).  Without -v, RPAGES stays "-" and every
# page is rasterized.
RPAGES=-
if [ X"$VPAGES" != X ]; then
	if [ "$FROMPDF" = - -o -S "$FROMPDF" -o -p "$FROMPDF" ]; then
		# Input is stdin, a socket or a FIFO: copy it to a regular file
		# first (presumably because the tools used below need seekable
		# input -- confirm).
		cat "$FROMPDF" > "$TMPPDFBASE"_.pdf
		FROMPDF="$TMPPDFBASE"_.pdf
	fi
	# awk program run over the pdfinfo output, with the -v argument passed
	# in as variable l.  It acts on the "Page:"/"Pages:" line only:
	#   p  = page count (second field of that line)
	#   l  = comma-separated items, each "N", "N-" (N to last page) or
	#        "N-M"; any other item aborts with status 1; upper bounds are
	#        clamped to p
	#   pp = set of selected page numbers
	#   v  = selected pages folded into space-separated "a" / "a-b" runs
	#   r  = the remaining pages, folded the same way
	# Output is the shell fragment  VPAGES="..."; RPAGES="..."  which is
	# eval'ed below.  Exit status: 0 on success, 1 if the page list is
	# malformed, 2 if no page-count line was seen.
	# The trailing awk comment on the first line of the program is a
	# mis-encoded legacy note and is kept byte-for-byte because it is part
	# of the program string; presumably it says that the optional "s" in
	# /^Pages?:/ is only a precaution.
	AWKSCR='{sub(/[ \t]*:/, ": ")}$1 ~ /^Pages?:/{ # s?ΤǰΤ
		ok = 1; p = $2
		split(l, a, /,/)
		for(i = 1; i in a; i++){
			if(a[i] ~ /^[0-9]+$/){
				j = k = a[i]+0
			} else
			if(a[i] ~ /^[0-9]+-$/){
				j = a[i]+0; k = p
			} else
			if(a[i] ~ /^[0-9]+-[0-9]+$/){
				split(a[i], b, /-/)
				j = b[1]+0; k = b[2]+0
			} else exit
			if(k > p) k = p
			for(; j <= k; j++) pp[j]
		}

		for(i = 1; i <= p; i++){
			if(i in pp){
				for(j = i+1; j in pp; j++); j--
				v = v (i == j ? i : i "-" j) " "
				i = ++j
			}
		}
		for(i = 1; i <= p; i++){
			if(!(i in pp)){
				for(j = i+1; !(j in pp) && j <= p; j++); j--
				r = r (i == j ? i : i "-" j) " "
				i = ++j
			}
		}

		sub(/ $/, "", v); sub(/ $/, "", r)
		printf "VPAGES=\"%s\"; RPAGES=\"%s\"", v, r
		ok = 2; exit
	}END{exit(ok == 2 ? 0 : ok == 1 ? 1 : 2)}'
	# $? below is the status of awk, the last command of the pipeline.
	TMPCMD="`pdfinfo "$FROMPDF" | awk -v l="$VPAGES" "$AWKSCR"`"
	case $? in
	0)	:;;
	1)	usage;;	# malformed -v argument
	*)	echo 'Input not a PDF?' >&2; exit 1;; # unexpected pdfinfo output
	esac
#	echo "$TMPCMD" >&2
	eval "$TMPCMD" # sets VPAGES and RPAGES
fi # as a result $VPAGES may turn out to be empty
# If $VPAGES is non-empty: it lists the EMF pages and $RPAGES (possibly
#   empty) lists the PNG pages; $FROMPDF is a regular file or a symlink in
#   the scratch directory.
# If $VPAGES is empty: every page becomes a PNG; $FROMPDF may still be "-",
#   and $RPAGES may be the single "-" or, when the -v argument selected no
#   existing page, the full page range computed above.

# Marker file: a pipeline stage that fails creates it, and the script exits
# as soon as it is seen (the left side of a pipe cannot abort the script
# directly).
ERF=err

# Vector pages: extract them with pdftk, burst into one PDF per page and
# convert each of those to EMF with inkscape.
if [ -n "$VPAGES" ]; then
	# $VPAGES is intentionally unquoted: each range is a separate argument.
	{ pdftk "$FROMPDF" cat $VPAGES output - || touch "$ERF"; } |
		pdftk - burst output "$TMPPDFBASE"-%d.pdf || exit 1
	# doc_data.txt, read later by filelist, is expected to hold the
	# per-page data at this point.
	if [ -e "$ERF" ]; then
		exit 1
	fi
	for page_pdf in "$TMPPDFBASE"-*.pdf; do
		LC_ALL=C inkscape -z "$page_pdf" --export-emf "$page_pdf".emf || exit 1
	done
	rm "$TMPPDFBASE"-*.pdf
	# Result: a-1.pdf.emf, a-2.pdf.emf, ...
fi

# Raster pages: feed either the whole input or the pdftk-extracted $RPAGES
# into the rasterizer, then post-process every PNG.
if [ -n "$RPAGES" ]; then
	if [ -z "$VPAGES" ]; then
		cat "$FROMPDF"
	else
		# $RPAGES is intentionally unquoted: one argument per range.
		pdftk "$FROMPDF" cat $RPAGES output - || touch "$ERF"
	fi |
	if [ "$LQ" = t ]; then
		# low quality: pdftoppm with antialiasing switched off
		pdftoppm -png -r "$DPI" -aa no -aaVector no - "$TMPPDFBASE"
	else
		pdftocairo -png -r "$DPI" - "$TMPPDFBASE"
	fi || exit 1
	# Result: a-1.png, a-2.png, ... or zero-padded a-01.png, a-02.png, ...
	if [ -e "$ERF" ]; then
		exit 1
	fi
	# Re-encode every page with maximum compression and white turned
	# transparent, going through a.png as a temporary file.
	for page_png in "$TMPPDFBASE"-*.png; do
		pngtopnm "$page_png" | pnmtopng -compress 9 -transparent =white \
			> "$TMPPDFBASE".png 2>/dev/null &&
		mv "$TMPPDFBASE".png "$page_png" || exit 1
	done
fi

# Emit on stdout the per-page size information read by the document builder:
#   - when $VPAGES is non-empty, the PageMediaRotation and
#     PageMediaDimension(s) lines of doc_data.txt (matched case-insensitively;
#     blanks are allowed before the key and before the colon);
#   - when $RPAGES is non-empty, one `file` description line for each
#     "$TMPPDFBASE"-*.png.
# Uses `grep -E` instead of the obsolescent `egrep`, which current GNU grep
# reports with a warning on stderr, and [[:blank:]] (space or tab) instead of
# an invisible literal tab inside the pattern.
filelist(){
	if [ X"$VPAGES" != X ]; then
		grep -E -i '^[[:blank:]]*PageMedia(Rotation|Dimensions?)[[:blank:]]*:' \
			doc_data.txt
	fi
	if [ X"$RPAGES" != X ]; then
		file "$TMPPDFBASE"-*.png
	fi
}
# { ls -la; filelist; } >&2

# Build a.docx / a.pptx in the scratch directory.  filelist supplies the
# page-size lines on stdin; the embedded Python program places one image per
# page (PNG for raster pages, EMF for vector pages) in original page order.
# Arguments: program name, dpi, "$VPAGES" (space-separated ranges), base
# name, extension.  A failure of the Python program now aborts the script.
# NOTE: the Python code lives inside a single-quoted shell string, so it must
# never contain a single quote character.
filelist | python -c 'if True:
	# The leading "if True:" allows the whole program to be indented.
	import sys, datetime, re
	import PIL.Image
	PIL.Image.MAX_IMAGE_PIXELS = None # avoid DecompressionBombWarning

	(cmd, dpi, vpages, base, ext) = sys.argv[1:]
	dpi = float(dpi)
	# "2 4-6" becomes [[2, 2], [4, 6]]: inclusive ranges of vector pages.
	vpages = [([int(i) for i in s.split("-")] if "-" in s else [int(s)]*2)
		for s in vpages.split()]

	if ext == "pptx":
		from pptx import Presentation
		from pptx.util import Inches, Pt
	else:
		from docx import Document
		from docx.shared import Inches, Pt
		from docx.enum.section import WD_ORIENT

	# Both lists hold (image file, width, height) tuples in page order.
	rxylist = []	# raster pages: size derived from pixel counts and dpi
	vxylist = []	# vector pages: size in points taken from doc_data.txt
	vidx = 0
	rot = False	# used when no rotation line precedes a dimensions line
	for l in sys.stdin:
		# Normalize "key   : value" to "key: value" before splitting.
		f = re.sub(r"\s*:", ": ", l, count=1).split()
		if not f:
			continue
		if f[0].startswith(base):
			# Line from file(1): "<name>: PNG image data, W x H, ..."
			rxylist.append((f[0].rstrip(":"),
				Inches(int(f[4]) / dpi),
				Inches(int(f[6].rstrip(",")) / dpi)))
		else:
			if "rot" in f[0].lower():
				# Remember whether the next page is turned sideways.
				rot = int(f[1]) in (90, 270)
				continue
			# Dimensions line: swap width and height for rotated pages.
			if rot:
				f[1],f[2] = f[2],f[1]
			vidx += 1
			vxylist.append(("{0}-{1}.pdf.emf".format(base, vidx),
				Pt(float(f[1])), Pt(float(f[2]))))

	if ext == "pptx":
		d = Presentation()
		SLD_LAYOUT_BLANK = 6
		blank_slide_layout = d.slide_layouts[SLD_LAYOUT_BLANK]

		# The slide is as large as the widest and the tallest page.
		d.slide_width, d.slide_height = [max(x) for x in
			zip(*map(lambda a:a[1:], rxylist + vxylist))]
		def register(d, name, x, y):
			# One blank slide per page, picture centred on it.
			# Floor division keeps the offsets integral on any Python.
			s = d.slides.add_slide(blank_slide_layout)
			s.shapes.add_picture(name,
				(d.slide_width - x) // 2,
				(d.slide_height - y) // 2,
				width = x)
	else:
		d = Document()

		def register(d, name, x, y):
			# One margin-less section per page, sized like the page;
			# the first picture reuses the initial section.
			if len(d.inline_shapes) == 0:
				s = d.sections[0]
			else:
				s = d.add_section()
			s.page_width = x
			s.page_height = y
			s.left_margin = Inches(0)
			s.right_margin = Inches(0)
			s.top_margin = Inches(0)
			s.bottom_margin = Inches(0)
			s.orientation = WD_ORIENT.PORTRAIT
			d.add_picture(name, width = s.page_width)

	# Merge the two lists back into page order: page i is a vector page
	# exactly when it falls inside the first remaining range of vpages.
	for i in range(1, len(rxylist) + len(vxylist) + 1):
		if not vpages or i < vpages[0][0]:
			xylist = rxylist
		else:
			if i == vpages[0][1]:
				del vpages[0]
			xylist = vxylist
		(name, x, y) = xylist[0]
		del xylist[0]

		register(d, name, x, y)

	c = d.core_properties
	c.author = cmd
	c.comments = c.last_modified_by = ""
	c.created = c.modified = datetime.datetime.now()

	d.save(base + "." + ext)
' "$CMD" "$DPI" "$VPAGES" "$TMPPDFBASE" "$EXT" || exit 1	# a.pptx or a.docx now exists

# docx only: patch word/document.xml inside the generated archive so that
# every <wp:inline ...> start tag carries distL/distR/distT/distB="0".
# The Perl program reads the file in ">"-terminated records, fails if a tag
# already has one of those attributes, and fails if no tag was found at all.
if [ "$EXT" = docx ]; then
	DOCXML=word/document.xml
	unzip "$TMPPDFBASE.$EXT" "$DOCXML" >/dev/null 2>&1
	if ! perl -i -pe '
		BEGIN{$/ = ">"}
		s/(?<=<)wp:inline\b.*?(?=>)/
			$cnt++, ($& =~ m|\bdist[LRTB]=| && exit(1)),
			"$& distL=\"0\" distR=\"0\" distT=\"0\" distB=\"0\""
		/ges;
		END{exit(1) if !$cnt}
	' "$DOCXML"
	then
		echo 'Unexpected output from python-docx' >&2
		exit 1
	fi
	# zip -m moves the edited file back into the archive, so the file must
	# be gone afterwards.  (The original note presumably explains that
	# zip -f was unsuitable because a change within the same second is not
	# seen as newer -- confirm.)
	if ! zip -T -m "$TMPPDFBASE.$EXT" "$DOCXML" >/dev/null 2>&1 ||
		[ -e "$DOCXML" ]
	then
		echo 'Failed creating .docx file' >&2
		exit 1
	fi
fi

# Deliver the result: stream it to stdout when no output path was chosen,
# otherwise move it to $TODOCX.
if [ -z "$TODOCX" ]; then
	cat "$TMPPDFBASE.$EXT"
else
	mv "$TMPPDFBASE.$EXT" "$TODOCX"
fi
