#!/bin/bash
# Create a codepoint file for a TTF or OTF font, using two methods
# - ttf2config.pl (included in the examples part of Font-TTF-Scripts)
# often works well, but for some fonts it misses certain things.
# - my get_codepoints prog uses fontconfig, but for some fonts that
# does not admit to the presence of certain codepoints - either they
# are correctly whitespace, or in practice they are empty.
# In an ideal world I would ignore the latter, but I need to know
# which whitespace characters exist, so I merge both outputs.
#
# Then create the corresponding coverage file.
#
# The font MUST be known to fontconfig (i.e. install it and run fc-cache)
# But in addition, if it is part of a ttc it must be extracted to a ttf
# (use fontforge to generate a ttf) and that ttf must be passed to this script.
#
# NB - this creates working files with predictable names in current
# directory, in case needed for debugging.
#
# Copyright © 2016 Ken Moffat
# covered by the MIT license, http://opensource.org/licenses/MIT

# Exactly one argument is required: the path to the font file.
# The hint names fontconfig (not fontforge): the rest of this script
# looks the font up with fc-list, so that is what must know the font.
if [ "$#" -ne 1 ]; then
	echo "supply /path/to/fontfile"
	echo "Use on ttf or otf files known to fontconfig"
	exit 1
fi

# functions ---------------------------

# check_if_ttc
#
# Decide whether $FILENAME is one of the single-face ttf files that
# were generated from a known ttc, and if so record the name of that
# face.
#
# For a ttc I have to generate the ttf, so I know the names and what
# fontconfig name matches (I only found 3 libre ttcs, plus one dubious
# one that fontconfig could not understand).
#
# Globals: FILENAME (read)    - base name of the font file
#          FONTNAME (written) - face name, set only for a known file
# Returns: 0 if the file is from a known ttc and fc-list reports the
#            face name
#          1 if the file is not from a known ttc
# Exits:   with status 1 (after printing a message) if the file is from
#          a known ttc but fc-list does not report the face name
check_if_ttc () {
	case "$FILENAME" in
		odosung.ttf)             FONTNAME="AR PL New Sung" ;;
		odosungmono.ttf)         FONTNAME="AR PL New Sung Mono" ;;
		UKaiCN.ttf)              FONTNAME="AR PL UKai CN" ;;
		UKaiHK.ttf)              FONTNAME="AR PL UKai HK" ;;
		UKaiTW.ttf)              FONTNAME="AR PL UKai TW" ;;
		UKaiTWMBE.ttf)           FONTNAME="AR PL UKai TW MBE" ;;
		UMingCN.ttf)             FONTNAME="AR PL UMing CN" ;;
		UMingHK.ttf)             FONTNAME="AR PL UMing HK" ;;
		UMingTW.ttf)             FONTNAME="AR PL UMing TW" ;;
		UMingTWMBE.ttf)          FONTNAME="AR PL UMing TW MBE" ;;
		WenQuanYiZenHei.ttf)     FONTNAME="Wen Quan Yi Zen Hei" ;;
		WenQuanYiZenHeiMono.ttf) FONTNAME="Wen Quan Yi Zen Hei Mono" ;;
		*)
			return 1 # not from a known ttc
			;;
	esac
	# Check that fontconfig knows of it.  The name must be followed
	# directly by ':' (or by ',' when a name in a different script
	# follows) to prevent TW matching TW MBE (unlikely that only one
	# of the faces is installed, but play safe).
	fc-list | grep -q -- "${FONTNAME}[:,]" && return 0
	echo "$FONTNAME has not been installed!"
	exit 1
}

# The helper scripts should be in the same directory as this script,
# so take the directory part of $0.  If $0 has no directory part (for
# example the script was started as 'bash scriptname') use '.', because
# ${0%/*} would otherwise give the script's own name.
case "$0" in
	*/*) SCRIPTS=${0%/*} ;;
	*)   SCRIPTS=. ;;
esac

# main line -----------------------------
if ! [ -r "${SCRIPTS}/font-contains" ]; then
	echo "cannot read font-contains"
	exit 1
fi

# First get the filename, it will be used when writing the codepoints
# and coverage files.
FILENAME=${1##*/} # for the ttf or otf, without any leading directories
FILETYPE=${FILENAME##*.} # text after the last '.', or the whole name if none
# reduce it to the name part, without -Regular or an ending -R before the extension
# if somebody wanted to investigate only bold or italic or book fonts, this could
# be extended, but for me that is not worth the time.
# similarly remove -medium for some japanese fonts.
# The extension is only removed at the end of the name, so that a
# similar looking sequence earlier in the name is left alone.
FILE=$(printf '%s\n' "$FILENAME" |
 sed -e 's/\.[ot]tf$//' -e 's/-Regular//' -e 's/-R$//' -e 's/-medium//')

if [ "$FILETYPE" != "otf" ] && [ "$FILETYPE" != "ttf" ]; then
	# report the name, it might not have an extension
	echo "unexpected file type for $FILENAME"
	exit 1
fi

# Now, find the name by which fontconfig knows it.  Because I use this
# on ttc files (well, two of them) and ttf2config.pl cannot read those,
# the ttc is known to fontconfig but the single-face ttf is not.  So begin
# by testing for those.
if ! check_if_ttc; then
	# Not a ttc, so look for it in fc-list.  Match the filename as a
	# fixed string between the last '/' of the path and the ':' which
	# follows it, so that '.' is not treated as a regex wildcard and a
	# longer filename ending in the same text is not picked up.
	# A font may have several entries; only the first one is used.
	FCENTRY=$(fc-list | grep -F -- "/${FILENAME}:" | head -n 1)
	if [ -z "$FCENTRY" ]; then
		echo "Error: cannot find $1 in fc-list"
		exit 1
	fi
	# After the filename is a colon, a space, the name, perhaps a comma
	# and a name in a different script, another colon, the Style and
	# perhaps translations.
	# For some fonts the name contains backslashes (presumably escapes
	# added by fontconfig - TODO confirm); all of them are removed.
	FONTNAME=$(printf '%s\n' "$FCENTRY" | cut -d ':' -f 2 |
	 sed -e 's/^ //' -e 's/,.*//' -e 's/:.*//' -e 's%\\%%g')
fi
echo "files will be called $FILE"
echo "fontconfig knows this as $FONTNAME"

# Both external programs must be on PATH; type -pa also reports where
# each one was found.
if ! type -pa ttf2config.pl; then
	echo "FAILED: you need to install ttf2config.pl and its deps"
	exit 1
fi

if ! type -pa get_codepoints; then
	echo "FAILED: you need to compile and install get_codepoints"
	exit 1
fi

# Take the quoted value from the third space-separated field of each
# UID= line and write it with a 0x prefix, so sort -g can order it.
echo "running ttf2config.pl"
ttf2config.pl "$1" | grep UID= | cut -d ' ' -f 3 | cut -d "'" -f2 |
 awk '{ print "0x" $1 }' | sort -g >"${FILE}.dat1"

# I prefer uppercase in the output, and get_codepoints still
# writes as U+ but sort -g will need 0x
echo "running get_codepoints"
get_codepoints "$FONTNAME" | tr 'U+abcdef' '0xABCDEF' >"${FILE}.dat2"

# now merge them and change back to U+ notation
sort -g -u -- "${FILE}.dat1" "${FILE}.dat2" | sed 's/0x/U+/' >"${FILE}.codepoints"

echo "creating coverage report $FILE.coverage"

"${SCRIPTS}/font-contains" "${FILE}.codepoints" >"${FILE}.coverage"

# comment this deletion for debugging
rm -- "${FILE}.dat1" "${FILE}.dat2"

