#!/usr/bin/env bash
# Using a file of codepoints from create-codepoint-files, put them all
# (from space onwards) into a text file, to look at their appearance.
# Each line contains a filling mark wherever the glyph is not present.
# This can then be opened in libreoffice and the glyphs changed to
# use the appropriate font.
#
# For combining characters, write a space first (it does not seem
# practical for me to do something similar for complex scripts, so
# those can still look messy in places).
#
# Copyright © 2013-6 Ken Moffat
# covered by the MIT license, http://opensource.org/licenses/MIT

#SHOWFROM=
#SHOWTO=

#SHOWDECFROM=
#SHOWDECTO=

# hex value (without the U+) to use where a glyph is not present
# 25A1 [ U+25A1 ] is good for fixed width, e.g. CJK ranges,
# but only if the font contains it ;-)
FILLING=0020

# non-empty when anything is in the array, waiting to be written
BUFFERED=

# uses decimal for the maths, to avoid the runtime overhead of either
# loading bc to calculate the next value in the range
# or of using a double printf (hex->dec, maths, dec->hex) because
# that needs sed to convert lowercase hex to uppercase (or tr, which is
# even slower)

# Exactly one argument is required: the codepoints file.
# The usage text goes to stderr because stdout is where the glyph
# table is written, and that is normally redirected to a file.
if [ $# -ne 1 ]; then
	echo "Usage: $0 /path/to/font.codepoints" >&2
	exit 1
fi

# the files contain U+.... and U+..... in alphabetic order
# if there are any 5-digit glyphs, write them to a temp file
# and then process that.
SUPP_PLANE=
# A unique scratch file per run: a fixed name in /tmp would be shared
# (and truncated) by any other copy of this script running at the same
# time.  The template stays under /tmp so the path contains no spaces.
EXTENDED=$(mktemp /tmp/supp-planes.XXXXXX) || exit 1
# Remove the scratch file on every exit path, including the ones that
# leave with a plain 'exit'.  rm -f is harmless if it is already gone.
trap 'rm -f "$EXTENDED"' EXIT

# variables for current 32-byte block
# set stored if something is in the array
STORED=

# GRP - a 16 or 32-byte group of codepoints, to sit on one line of output
# ...H is the hex form used in the line header, ...D the decimal form
# used for the maths.  GRPFROMD=0 also marks "no group set up yet".
GRPFROMH=
GRPFROMD=0
GRPTOH=
GRPTOD=


# decimal value of current hex value
DECIMAL=

# number of this block (index into blockends / blocknames)
BLOCK=0
# number of last block in current table
MAXBLOCK=249
# working storage for setting title correctly.
OLDBLOCK=

# maximum characters in this line or group, default 31
MAXIDX=31

# START is used when a block changes, ensure it is initialised
# when it is read in other functions.  Similarly OFFSET.
START=32
OFFSET=0

# the blocks file should be in the same directory as this script.
# When the script is started as 'bash name' $0 has no directory part,
# and ${0%/*} would then just give the script name back, so fall back
# to the current directory in that case.
case $0 in
	*/*) SCRIPTS=${0%/*} ;;
	*) SCRIPTS=. ;;
esac

DUMP=0
if ! [ -r "${SCRIPTS}/unicode-blocks" ]; then
	echo "cannot read ${SCRIPTS}/unicode-blocks, oh dear" >&2
	# error() is only defined further down the file, so it cannot be
	# called from here: do the same clean-up and exit directly.
	rm -f "$EXTENDED"
	exit 1
fi
# unicode-blocks supplies the blockends and blocknames arrays
. "${SCRIPTS}/unicode-blocks"

# functions

error () {
	# Remove the scratch file and abandon the run.
	# Globals: EXTENDED (read) - path of the scratch file
	# Returns: does not return; the script exits with status 1.
	# The path is quoted (and guarded by --) so that a name containing
	# spaces or glob characters is removed as one file.
	rm -f -- "$EXTENDED"
	exit 1
}

store () {
	# Remember, in NEXT, the decimal codepoint that would come straight
	# after DECIMAL if the sequence continues.
	# Globals: DECIMAL (read), NEXT (written)
	(( NEXT = DECIMAL + 1 ))
}

decimal () {
	# Get the decimal value of the hex codepoint in UNICODE, for shell maths.
	# Globals: UNICODE (read)    - hex digits, without the U+
	#          DECIMAL (written) - the same value in decimal
	# Returns: 0 on success, printf's non-zero status when UNICODE is not
	#          a valid hex number (get_args tests $? after calling this).
	#
	# printf -v stores the result directly, so no subshell is forked;
	# this runs once for every codepoint in the input.
	# The argument is quoted so that a value containing whitespace is
	# rejected as one bad number instead of being split into several.
	printf -v DECIMAL '%d' "0x${UNICODE}"
}

title () {
	# Print the name of the current block, followed by an empty line.
	# Globals: blocknames (read), BLOCK (read)
	# Outputs: two lines on stdout
	# The name is quoted and passed as data to a fixed format, so it is
	# printed exactly as stored: no word splitting and no filename
	# expansion of characters such as * or ?.
	printf '%s\n\n' "${blocknames[$BLOCK]}"
}

# functions added for generate-characters

set_title () {
	# Find the block that holds DECIMAL, print its title and note whether
	# it is a block of combining characters.
	# Globals: blockends, blocknames, DECIMAL (read)
	#          BLOCK (advanced), IS_COMBINING (written: Y or empty)
	# Outputs: an empty line, then whatever title prints

	# blockends holds the first codepoint of the NEXT block, so step
	# forward while that value has already been reached
	while [ ${blockends[$BLOCK]} -le $DECIMAL ]; do
		BLOCK=$(( BLOCK + 1 ))
	done

	echo
	title

	# a block counts as combining when its name contains 'Combining'
	case ${blocknames[$BLOCK]} in
		*Combining*)
			IS_COMBINING=Y
			;;
		*)
			IS_COMBINING=
			;;
	esac
}

get_args () {
	# Set up the range to show and, optionally, what to use as a filler
	# where codepoints are not in the font.
	# Globals read:    FILLER  - optional {U+}hex value for the filler
	# Globals written: FROM, TO, SHOWFROM, SHOWTO, SHOWDECFROM, SHOWDECTO,
	#                  UNICODE, DECIMAL, and FILLING when FILLER is given
	# Outputs: a note of the filler in use, on stderr
	# Exits (through error) on a bad range or a bad FILLER.
	# Without FILLER the existing value of FILLING is kept.

	# the range is currently fixed, it is not taken from the caller
	FROM=0020
	UNICODE=${FROM/U+/}
	decimal
	SHOWFROM=$FROM
	SHOWDECFROM=$DECIMAL

	# NOTE(review): TO goes through the same hex conversion as FROM, so
	# this is U+65535 and not decimal 65535 (U+FFFF) - confirm the intent.
	TO=65535
	UNICODE=${TO/U+/}
	decimal
	SHOWTO=$TO
	SHOWDECTO=$DECIMAL

	if [ "$SHOWDECFROM" -ge "$SHOWDECTO" ]; then
		echo "range error, $SHOWFROM not less than $SHOWTO" >&2
		# leave through error so the scratch file is removed and the
		# exit status is non-zero
		error
	fi

	UNICODE=

	if [ -n "$FILLER" ]; then
		UNICODE=${FILLER/U+/}
		# Only plain hex digits are acceptable; flush writes the value
		# with a \U escape, which takes at most eight of them.  The
		# pattern also rejects embedded whitespace.
		if ! [[ $UNICODE =~ ^[0-9A-Fa-f]{1,8}$ ]] || ! decimal; then
			echo "invalid {U+}hex value specified for FILLER" >&2
			error
		fi
		FILLING=$UNICODE
	fi
	echo "will fill any gaps with U+$FILLING" >&2
}

calc_block () {
	# Move BLOCK on to the block that contains DECIMAL.  If that is a
	# different block from before, print its title and restart the
	# current group at the beginning of the new block.
	# Globals read:    blockends, blocknames, DECIMAL
	# Globals written: OLDBLOCK, BLOCK, START and, on a change of block,
	#                  IS_COMBINING, OFFSET, GRPFROMD, GRPTOD, IDX,
	#                  MAXIDX, TEST
	# Outputs: on a change of block, an empty line and the title
	local skip

	OLDBLOCK=$BLOCK
	while [ ${blockends[$BLOCK]} -le $DECIMAL ]; do
		# blockends holds values of first char in NEXT block, so the
		# last one passed over is the start of the new current block
		START=${blockends[$BLOCK]}
		BLOCK=$(( BLOCK + 1 ))
	done

	# still in the same block: nothing else changes
	[ $BLOCK != $OLDBLOCK ] || return 0

	echo
	title

	# a block counts as combining when its name contains 'Combining'
	case ${blocknames[$BLOCK]} in
		*Combining*)
			IS_COMBINING=Y
			;;
		*)
			IS_COMBINING=
			;;
	esac

	# non-zero when the block does not start on a multiple of 32
	OFFSET=$(( START % 32 ))

	# begin the group at the start of the block - needed if there was
	# a short group at the end of the previous block
	GRPFROMD=$(( START ))
	GRPTOD=$(( START + 31 ))
	IDX=$(( DECIMAL - GRPFROMD ))

	# if the first lines of the block are missing the index does not
	# fit in the buffer: move the group on by whole lines of 32
	if (( IDX > 31 )); then
		skip=$(( IDX / 32 * 32 ))
		GRPFROMD=$(( GRPFROMD + skip ))
		GRPTOD=$(( GRPTOD + skip ))
		IDX=$(( IDX - skip ))
	fi

	# and refresh MAXIDX, dropping to a half line if a full one would
	# run past the end of the group
	MAXIDX=31
	TEST=$(( GRPFROMD + MAXIDX ))
	if [ $TEST -gt $GRPTOD ]; then
		MAXIDX=15
	fi
}

calc_group () {
	# Work out the 32-codepoint group (output line) that DECIMAL falls
	# in.  Only the NEW values are set here, the caller copies them.
	# Globals read:    OFFSET, DECIMAL, START
	# Globals written: NEWGRPFROMD, NEWGRPTOD, plus NEWGRPD (aligned
	#                  case) or END (offset case)
	case $OFFSET in
		0)
			# block starts on a multiple of 32: round down to a
			# multiple of 32, the group ends 31 further on
			NEWGRPD=$(( DECIMAL / 32 ))
			NEWGRPFROMD=$(( NEWGRPD * 32 ))
			NEWGRPTOD=$(( NEWGRPFROMD + 31 ))
			;;
		*)
			# block starts part way through a 32: count lines of
			# 32 from START until one reaches DECIMAL
			END=$(( START + 31 ))
			while (( END < DECIMAL )); do
				END=$(( END + 32 ))
			done
			NEWGRPFROMD=$(( END - 31 ))
			NEWGRPTOD=$END
			;;
	esac
}

# the line buffer: one hex codepoint (or the filler) per slot.
# -a makes it an indexed array; the word 'array' is not a declare
# option and would only have created a second variable of that name.
declare -a codepoints

initialise_array () {
	# Fill all 32 slots of the line buffer with the filler.
	# Globals: FILLING (read), codepoints (written)
	# The loop counter is local: store_char keeps the slot of the
	# codepoint it is working on in the global IDX, and counting with
	# IDX here would leave that set to 31.
	local slot
	for slot in {0..31}; do
		codepoints[$slot]=$FILLING
	done
}

flush () {
	# Write the buffered line, if there is one, to stdout: the header
	# "U+<from>-<to>", a tab, then one character per slot, then newline.
	# Globals read:    BUFFERED, GRPFROMH, GRPTOH, MAXIDX, IS_COMBINING,
	#                  codepoints
	# Globals written: BUFFERED (always cleared)
	# The loop counter is local so that the caller's IDX is left alone.
	local slot last

	if [ -n "$BUFFERED" ]; then
		# a full line has 32 slots, anything else is a half line of 16
		last=15
		if [ "$MAXIDX" -eq 31 ]; then
			last=31
		fi

		printf 'U+%s-%s\t' "$GRPFROMH" "$GRPTOH"
		for (( slot = 0; slot <= last; slot++ )); do
			# for combining character, write a space first
			# so that I can read the result instead of
			# getting a nasty smudge
			if [ "$IS_COMBINING" = "Y" ]; then
				printf ' '
			fi
			# write the character: %b expands the \U escape
			printf '%b' "\\U${codepoints[$slot]}"
		done
		# now do the newline
		printf '\n'
	fi
	BUFFERED=
}

set_group_hex () {
	# Use GRPFROMD and GRPTOD to get the hex range for this group,
	# because there is at least one codepoint to be output.
	# Globals read:    GRPFROMD, GRPTOD (decimal)
	# Globals written: GRPFROMH - start of the group in upper case hex,
	#                             zero-padded to at least 4 digits
	#                  GRPTOH   - last two hex digits of the group end
	#
	# %04X gives upper case and the padding in one step and printf -v
	# stores the result directly, so nothing is forked here.
	local to_hex

	printf -v GRPFROMH '%04X' "$GRPFROMD"

	# pad the end in the same way, so that there are always at least
	# two digits to keep
	printf -v to_hex '%04X' "$GRPTOD"
	GRPTOH=${to_hex: -2}
}

store_char () {
	# We have a unicode value, without the U+, of interest.
	# Put it into the line buffer, first writing out the previous line
	# and starting a new one if it does not belong to the current group.
	# Globals read:    UNICODE, DECIMAL, blockends, DUMP, EXTENDED
	# Globals written: OLDBLOCK, IDX, GRPFROMD, GRPTOD, MAXIDX, GRPENDD,
	#                  codepoints, BUFFERED, plus whatever the helpers
	#                  called from here set
	# Exits with status 2 if the slot calculated is outside the buffer.

	# first, save current block, to detect when it should change
	OLDBLOCK=$BLOCK

	if [ $GRPFROMD -eq 0 ]; then
		# nothing has been stored yet: print the first title and set
		# up the first line
		set_title
		initialise_array
		calc_group
		GRPFROMD=$NEWGRPFROMD
		GRPTOD=$NEWGRPTOD
		# ensure the address range for first line is set.
		set_group_hex
	fi

	# Slot of this codepoint in the current line.  This has to be worked
	# out after the set-up above: the helpers are free to use the global
	# IDX as scratch, and a stale value would put the very first
	# codepoint into the wrong slot.
	IDX=$(( DECIMAL - GRPFROMD ))

	if [ $DECIMAL -gt $GRPTOD ] || [ $IDX -gt $MAXIDX ]; then
		# beyond the current line: write it out and start another
		flush
		initialise_array
		calc_group
		GRPFROMD=$NEWGRPFROMD
		GRPTOD=$NEWGRPTOD
		calc_block
		#  hack - some blocks are a multiple of 16, not a multiple of 32
		# so subtract 16 if beyond the block end
		MAXIDX=31
		GRPENDD=$(( GRPFROMD + 31 ))
		if [ $GRPENDD -gt ${blockends[$BLOCK]} ]; then
			GRPTOD=$(( GRPTOD - 16 ))
			MAXIDX=15
		fi
		if [ $DUMP = 1 ]; then
			echo "new array, grptod $GRPTOD index $IDX max $MAXIDX"
		fi
		set_group_hex
		# the group has moved, so the slot has to be worked out again
		IDX=$(( DECIMAL - GRPFROMD ))
	fi

	if [ $IDX -gt 31 ] || [ $IDX -lt 0 ]; then
		# should not happen.  Report on stderr, so that the message is
		# not buried in the glyph table, and tidy up before leaving.
		echo "U+${UNICODE} ${DECIMAL}d : bad index ${IDX} block is $BLOCK" >&2
		echo "group is ${GRPFROMH}-${GRPTOH} or ${GRPFROMD}-${GRPTOD}" >&2
		echo "fubar" >&2
		rm -f -- "$EXTENDED"
		exit 2
	fi
	codepoints[$IDX]=$UNICODE
	BUFFERED=true
}

# main line starts here

if ! [ -r "$1" ]; then
	echo "cannot read $1" >&2
	error
fi

get_args

# the name of the input has to contain 'codepoints'
case $1 in
	*codepoints*)
		;;
	*)
		echo "input should be a .codepoints file" >&2
		# leave through error so that the scratch file is removed
		error
		;;
esac

# Print a title at the front of the file: the file name without its
# directory and without the literal text '.codepoints'
FONTNAME=${1##*/}
FONTNAME=${FONTNAME/.codepoints/}
echo  "Glyphs in font $FONTNAME"

# this assumes the range will NOT overlap the first plane
# needs attention later
#
# Only the first field of each line is used.  The extra test on the
# read lets a last line without a newline be processed too.
while read -r UNICODE rest || [ -n "$UNICODE" ]
do
	UNICODE=${UNICODE/U+/}
	# nothing to do for an empty line
	[ -n "$UNICODE" ] || continue

	if [ ${#UNICODE} -gt 4 ]; then
		# supplemental plane:
		# this might come DURING a range, e.g. U+1AEF U+1AEF0 U+1AF0
		# so do NOT flush any range
		SUPP_PLANE=true
		echo "$UNICODE" >>"$EXTENDED"
		continue
	fi

	# find decimal value of this codepoint, to compare
	decimal

	# control debugging - for specific codepoint (in decimal)
	# e.g. set to 8679 for U+21E7, 0 turns it off
	if [ "$DECIMAL" = 0 ]; then
		DUMP=1
	else
		DUMP=0
	fi

	# everything up to and including space (32) is left out
	if [ "$DECIMAL" -gt 32 ]; then
		store_char
	fi

done  <"$1"
# need to flush anything in storage
flush


# now the codepoints beyond the first plane, which were put aside above
if [ -n "$SUPP_PLANE" ]; then
	while read -r UNICODE rest
	do
		UNICODE=${UNICODE/U+/}
		decimal
		store_char
	done <"$EXTENDED"
	flush
fi
# clean up
rm -f -- "$EXTENDED"
