From 0f227e78349a5cdd7122e4d1aeb15912edf9ba6f Mon Sep 17 00:00:00 2001
From: Doug Felt <dougfelt@google.com>
Date: Tue, 3 Nov 2015 18:40:19 -0800
Subject: [PATCH] wip - update emoji tooling to support tr51 sequences

---
 Makefile                                 |   3 +-
 NotoColorEmoji.tmpl.ttx.tmpl             |  11 +++
 build_emoji_set.py                       |  73 ++++++++++++++
 generate_emoji_placeholders.py           |  95 ++++++++++++++++++
 third_party/color_emoji/add_glyphs.py    | 120 +++++++++++++++++++----
 third_party/color_emoji/emoji_builder.py |  32 ++++--
 6 files changed, 306 insertions(+), 28 deletions(-)
 create mode 100644 build_emoji_set.py
 create mode 100644 generate_emoji_placeholders.py
diff --git a/Makefile b/Makefile
index 9217cc4e2..74cd544c2 100644
--- a/Makefile
+++ b/Makefile
@@ -78,7 +78,8 @@ flag-symlinks: $(WAVED_FLAGS)
 
 $(PNG128_FLAGS): flag-symlinks
 
-EMOJI_PNG128 = ./png/128/emoji_u
+#EMOJI_PNG128 = ./png/128/emoji_u
+EMOJI_PNG128 = /tmp/placeholder_emoji_plus/emoji_u
 
 EMOJI_BUILDER = third_party/color_emoji/emoji_builder.py
 ADD_GLYPHS = third_party/color_emoji/add_glyphs.py
diff --git a/NotoColorEmoji.tmpl.ttx.tmpl b/NotoColorEmoji.tmpl.ttx.tmpl
index 64461de00..f982d063c 100644
--- a/NotoColorEmoji.tmpl.ttx.tmpl
+++ b/NotoColorEmoji.tmpl.ttx.tmpl
@@ -4,6 +4,10 @@
   <GlyphOrder>
     <!-- The 'id' attribute is only for humans; it is ignored when parsed. -->
     <GlyphID id="0" name=".notdef"/>
+    <GlyphID id="1" name="null"/>
+    <GlyphID id="2" name="nonmarkingreturn"/>
+    <GlyphID id="3" name="space"/>
+    <GlyphID id="4" name="u200D"/>
   </GlyphOrder>
 
   <head>
@@ -119,12 +123,19 @@
 
   <hmtx>
     <mtx name=".notdef" width="2550" lsb="0"/>
+    <mtx name="null" width="0" lsb="0"/>
+    <mtx name="nonmarkingreturn" width="2550" lsb="0"/>
+    <mtx name="space" width="2550" lsb="0"/>
+    <mtx name="u200D" width="0" lsb="0"/>
   </hmtx>
 
   <cmap>
     <tableVersion version="0"/>
     <cmap_format_12 platformID="3" platEncID="10" language="0" format="12" reserved="0" length="1" nGroups="1">
       <map code="0x0" name=".notdef"/><!-- &lt;control> -->
+      <map code="0xd" name="nonmarkingreturn"/>
+      <map code="0x20" name="space"/>
+      <map code="0x200d" name="u200D"/>
     </cmap_format_12>
   </cmap>
 
diff --git a/build_emoji_set.py b/build_emoji_set.py
new file mode 100644
index 000000000..3f0708bfe
--- /dev/null
+++ b/build_emoji_set.py
@@ -0,0 +1,73 @@
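+# Build the combined emoji image set in DST.  DST is deleted first, then:
+# - the placeholder images are copied to DST (except SKIP_PLACEHOLDERS),
+# - the noto images are copied to DST, replacing same-named placeholders,
+# - the draft images are copied to DST under lower-cased names, using the
+#   highest-numbered "name(N).png" variant of each image when one exists.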
+
+import glob
+import os
+from os import path
+import re
+import shutil
+
+DST = "/tmp/placeholder_emoji_plus"
+
+SRC_PLACEHOLDER = "/tmp/placeholder_emoji"
+SRC_NOTO = "/usr/local/google/users/dougfelt/newnoto/noto-emoji/png/128"
+SRC_DRAFT = "/usr/local/google/home/dougfelt/Downloads/PNG_latest_working_draft"
+
+# First, scan the draft images and select which ones to use.  This does
+# two things:
+# - The download package returns all the images, including previous versions.
+#   Ensure we use the one with the highest version.
+# - The names often mix case.  Make sure we have all lower case names.
+#
+# If something seems amiss, we fail.
+
+UPDATED_NAMES = {}  # lower-cased 'name.png' -> (highest version seen, path)
+FIXED_NAMES = {}  # lower-cased 'name.png' -> path of the draft image to copy
+VAR_PAT = re.compile(r'(.*?)\((\d+)\)\.png')  # 'name(N).png' variants
+for fname in glob.glob(path.join(SRC_DRAFT, '*.png')):
+  name = path.basename(fname)
+  m = VAR_PAT.match(name)
+  if m:
+    name = '%s.png' % m.group(1).lower()
+    version = int(m.group(2))
+    if version > UPDATED_NAMES.get(name, (0, None))[0]:  # keep the highest N
+      print 'update %s to version %d' % (name, version)
+      UPDATED_NAMES[name] = (version, fname)
+  else:
+    name = name.lower()
+    FIXED_NAMES[name] = fname
+
+for name in UPDATED_NAMES:
+  if name not in FIXED_NAMES:  # numbered variant without an unnumbered file
+    raise Exception('updated name %s not in names' % name)
+  fname = UPDATED_NAMES[name][1]
+  print 'using updated image %s for %s' % (fname, name)
+  FIXED_NAMES[name] = fname
+
+# Now, recreate the destination directory and copy the data into it.
+
+if path.isdir(DST):
+  shutil.rmtree(DST)
+os.makedirs(DST)
+
+SKIP_PLACEHOLDERS = frozenset([  # placeholder file names that are not copied
+  'emoji_u1f468_200d_1f469_200d_1f466.png',
+  'emoji_u1f469_200d_2764_fe0f_200d_1f468.png',
+  'emoji_u1f469_200d_2764_fe0f_200d_1f48b_200d_1f468.png',
+])
+
+for fname in glob.glob(path.join(SRC_PLACEHOLDER, '*.png')):
+  basename = path.basename(fname)
+  if basename in SKIP_PLACEHOLDERS:
+    print 'skip %s' % basename
+    continue
+  shutil.copy(fname, DST)
+
+for fname in glob.glob(path.join(SRC_NOTO, '*.png')):  # replaces placeholders
+  shutil.copy(fname, DST)
+
+for name, fname in FIXED_NAMES.iteritems():  # drafts are copied last
+  shutil.copy(fname, path.join(DST, name))
diff --git a/generate_emoji_placeholders.py b/generate_emoji_placeholders.py
new file mode 100644
index 000000000..48b508231
--- /dev/null
+++ b/generate_emoji_placeholders.py
@@ -0,0 +1,95 @@
+import os
+from os import path
+import subprocess
+
+OUTPUT_DIR = '/tmp/placeholder_emoji'
+
+def generate_image(name, text):
+  # Render text as a 100x100 label image OUTPUT_DIR/name using 'convert'.
+  print name, text.replace('\n', '_')
+  command = ['convert', '-size', '100x100', 'label:%s' % text]
+  subprocess.check_call(command + ['%s/%s' % (OUTPUT_DIR, name)])
+
+def is_color_patch(cp):
+  return 0x1f3fb <= cp <= 0x1f3ff  # U+1F3FB..U+1F3FF, inclusive
+
+def has_color_patch(values):
+  """Return True if any code point in values is a color patch.
+
+  An empty sequence yields False."""
+  return any(is_color_patch(v) for v in values)
+
+def regional_to_ascii(cp):
+  return unichr(cp - 0x1f1e6 + ord('A'))  # U+1F1E6 maps to 'A', and so on
+
+def is_flag_sequence(values):
+  """Return True if values is exactly two regional indicator code points.
+
+  The accepted range is the 26 code points starting at U+1F1E6, the same
+  ones regional_to_ascii turns into 'A'..'Z'.
+  """
+  first, last = 0x1f1e6, 0x1f1e6 + 25
+  return len(values) == 2 and all(first <= v <= last for v in values)
+
+def is_keycap_sequence(values):
+  return len(values) == 2 and values[-1] == 0x20e3  # base + U+20E3
+
+def get_keycap_text(values):
+  return '-' + unichr(values[0]) + '-' # convert gags on '['
+
+char_map = {  # code point -> label letter; '-' entries are left out of labels
+    0x1f468: 'M', # man
+    0x1f469: 'W', # woman
+    0x1f466: 'B', # boy
+    0x1f467: 'G', # girl
+    0x2764: 'H', # heavy black heart, no var sel
+    0x1f48b: 'K', # kiss mark
+    0x200D: '-', # zwj placeholder
+    0xfe0f: '-', # variation selector placeholder
+    0x1f441: 'I', # Eye
+    0x1f5e8: 'W', # 'witness' (left speech bubble)
+}
+
+def get_combining_text(values):
+  # Join the char_map letters for values, dropping '-' placeholders.
+  # Returns None as soon as a code point has no mapping.
+  letters = []
+  for cp in values:
+    if not char_map.get(cp):
+      return None
+    letters.append(char_map[cp].replace('-', ''))
+  return ''.join(letters)
+
+
+if not path.isdir(OUTPUT_DIR):
+  os.makedirs(OUTPUT_DIR)
+
+# sequences.txt has one sequence per line: hex code points joined by '_'.
+# A placeholder image is generated for each sequence we can label.
+with open('sequences.txt', 'r') as f:
+  for seq in f:
+    seq = seq.strip()
+    if not seq:
+      continue  # blank line (e.g. at end of file); int('', 16) would raise
+    text = None
+    values = [int(code, 16) for code in seq.split('_')]
+    if len(values) == 1:
+      text = '%04X' % values[0] # ensure upper case format
+    elif is_flag_sequence(values):
+      text = ''.join(regional_to_ascii(cp) for cp in values)
+    elif has_color_patch(values):
+      print 'skipping color patch sequence %s' % seq
+    elif is_keycap_sequence(values):
+      text = get_keycap_text(values)
+    else:
+      text = get_combining_text(values)
+      if not text:
+        print 'missing %s' % seq
+
+    if text:
+      if len(text) > 3:
+        # Break long labels into two rows: 2+2 for four characters,
+        # otherwise keep the last three characters on the second row.
+        split = 2 if len(text) == 4 else len(text) - 3
+        text = '%s\n%s' % (text[:split], text[split:])
+      generate_image('emoji_u%s.png' % seq, text)
diff --git a/third_party/color_emoji/add_glyphs.py b/third_party/color_emoji/add_glyphs.py
index a8d986547..bea880064 100644
--- a/third_party/color_emoji/add_glyphs.py
+++ b/third_party/color_emoji/add_glyphs.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-import glob, os, sys
+import collections, glob, os, sys
 from fontTools import ttx
 from fontTools.ttLib.tables import otTables
 from png import PNG
@@ -10,11 +10,31 @@ sys.path.append(
 import add_emoji_gsub
 
 
+def is_vs(cp):
+        return 0xfe00 <= cp <= 0xfe0f  # U+FE00..U+FE0F, inclusive
+
+def codes_to_string(codes):
+	"""Return the unicode string for underscore-separated hex code points.
+
+	codes is the code part of an image file name, e.g. "1F468_200D_1F469"
+	or just "1F600"; split() handles both, so no special case is needed.
+	"""
+	return "".join ([unichr (int (code, 16)) for code in codes.split ("_")])
+
+
+def glyph_sequence(string):
+        # Names of the component glyphs of a ligature, one per character of
+        # string in order; variation selectors are stripped.
+        return ["u%04X" % cp for cp in map(ord, string) if not is_vs(cp)]
+
+
 def glyph_name(string):
+        # name of a ligature
+        # includes variation selectors when present
 	return "_".join (["u%04X" % ord (char) for char in string])
 
 
-def add_ligature (font, string):
+def add_ligature (font, seq, name):
 	if 'GSUB' not in font:
 		ligature_subst = otTables.LigatureSubst()
 		ligature_subst.ligatures = {}
@@ -34,17 +54,27 @@ def add_ligature (font, string):
 	ligatures = lookup.SubTable[0].ligatures
 
 	lig = otTables.Ligature()
-	lig.CompCount = len(string)
-	lig.Component = [glyph_name(ch) for ch in string[1:]]
-	lig.LigGlyph = glyph_name(string)
+	lig.CompCount = len(seq)
+	lig.Component = seq[1:]
+	lig.LigGlyph = name
 
-	first = glyph_name(string[0])
+	first = seq[0]
 	try:
 		ligatures[first].append(lig)
 	except KeyError:
 		ligatures[first] = [lig]
 
 
+# Ligating sequences for emoji that already have a defined codepoint,
+# to match the sequences for the related emoji with no codepoint.
+# The key is the name of the glyph with the codepoint, the value is the
+# name of the sequence in filename form.
+EXTRA_SEQUENCES = {
+    'u1F46A': '1F468_200D_1F469_200D_1F466', # MWB
+    'u1F491': '1F469_200D_2764_FE0F_200D_1F468', # WHM
+    'u1F48F': '1F469_200D_2764_FE0F_200D_1F48B_200D_1F468', # WHKM
+}
+
 if len (sys.argv) < 4:
 	print >>sys.stderr, """
 Usage:
@@ -65,23 +95,22 @@ table and the first GSUB lookup (if existing) are modified.
 
 in_file = sys.argv[1]
 out_file = sys.argv[2]
-img_prefix = sys.argv[3]
+img_prefixen = sys.argv[3:]
 del sys.argv
 
 font = ttx.TTFont()
 font.importXML (in_file)
 
 img_files = {}
-glb = "%s*.png" % img_prefix
-print "Looking for images matching '%s'." % glb
-for img_file in glob.glob (glb):
-	codes = img_file[len (img_prefix):-4]
-	if "_" in codes:
-		pieces = codes.split ("_")
-		u = "".join ([unichr (int (code, 16)) for code in pieces])
-	else:
-		u = unichr (int (codes, 16))
-	img_files[u] = img_file
+for img_prefix in img_prefixen:
+	glb = "%s*.png" % img_prefix
+	print "Looking for images matching '%s'." % glb
+	for img_file in glob.glob (glb):
+		codes = img_file[len (img_prefix):-4]
+		u = codes_to_string(codes)
+		if u in img_files:  # an image from a later prefix replaces it
+			print 'overwriting %s with %s' % (img_files[u], img_file)
+		img_files[u] = img_file
 if not img_files:
 	raise Exception ("No image files found in '%s'." % glb)
 
@@ -98,20 +127,71 @@ h = font['hmtx'].metrics
 img_pairs = img_files.items ()
 img_pairs.sort (key=lambda pair: (len (pair[0]), pair[0]))
 
+glyph_names = set()
+ligatures = {}
+
+def add_lig_sequence(ligatures, seq, n):
+        """Record glyph name n as the ligature for the glyph-name list seq.
+
+        ligatures is keyed by tuples of glyph names.  An existing entry is
+        reported and then replaced, so a later image can override an earlier
+        one.  Sequences with ZWJ (u200D) are assumed to be emoji 'ligatures'
+        whose fully reversed (rtl) order is also valid, so that order is
+        recorded too; internal permutations are not.
+        """
+        orders = [('lig sequence %s, replace %s with %s', tuple(seq))]
+        if 'u200D' in seq:
+                rkey = tuple(reversed(seq))
+                orders.append(('rev lig sequence %s, replace %s with %s', rkey))
+        # The forward order is always handled first.
+        for message, key in orders:
+                if key in ligatures:
+                        print message % (key, ligatures[key], n)
+                ligatures[key] = n
+
+
 for (u, filename) in img_pairs:
 	print "Adding glyph for U+%s" % ",".join (["%04X" % ord (char) for char in u])
 	n = glyph_name (u)
+        glyph_names.add(n)
+
 	g.append (n)
 	for char in u:
-		if char not in c:
+                cp = ord(char)
+		if cp not in c and not is_vs(cp):
 			name = glyph_name (char)
-			c[ord (char)] = name
+			c[cp] = name
 			if len (u) > 1:
 				h[name] = [0, 0]
 	(img_width, img_height) = PNG (filename).get_size ()
 	advance = int (round ((float (ascent+descent) * img_width / img_height)))
 	h[n] = [advance, 0]
 	if len (u) > 1:
-		add_ligature (font, u)
+                seq = glyph_sequence(u)
+                add_lig_sequence(ligatures, seq, n)
+
+for n, codes in EXTRA_SEQUENCES.iteritems():
+        if n not in glyph_names:
+                print 'extras: no glyph for %s' % n
+                continue
+        # The existing codepoint glyph also serves as this sequence's ligature.
+        add_lig_sequence(ligatures, glyph_sequence(codes_to_string(codes)), n)
+
+
+keyed_ligatures = collections.defaultdict(list)
+for seq, lig_name in ligatures.iteritems():
+        # Group the (sequence, ligature name) pairs by their first glyph name.
+        keyed_ligatures[seq[0]].append((seq, lig_name))
+
+for base in sorted(keyed_ligatures):
+        # Sort longest first, this ensures longer sequences with common prefixes
+        # are handled before shorter ones.  It would be better to have multiple
+        # lookups, most likely.
+        pairs = sorted(keyed_ligatures[base],
+                       key=lambda pair: (len(pair[0]), pair[0]), reverse=True)
+        print 'base %s has %d sequences' % (base, len(pairs))
+        for seq, name in pairs:
+                print seq, name
+                add_ligature(font, seq, name)
 
 font.saveXML (out_file)
diff --git a/third_party/color_emoji/emoji_builder.py b/third_party/color_emoji/emoji_builder.py
index 5a4e646fe..844102ab5 100644
--- a/third_party/color_emoji/emoji_builder.py
+++ b/third_party/color_emoji/emoji_builder.py
@@ -20,7 +20,8 @@
 
 import sys, struct, StringIO
 from png import PNG
-
+import os
+from os import path
 
 def get_glyph_name_from_gsub (string, font, cmap_dict):
 	ligatures = font['GSUB'].table.LookupList.Lookup[0].SubTable[0].ligatures
@@ -83,6 +84,7 @@ class CBDT:
 		write_func = self.image_write_func (image_format)
 		for glyph in glyphs:
 			img_file = glyph_filenames[glyph]
+                        print 'writing data for glyph %s' % path.basename(img_file)
 			offset = self.tell ()
 			write_func (PNG (img_file))
 			self.glyph_maps.append (GlyphMap (glyph, offset, image_format))
@@ -108,6 +110,7 @@ class CBDT:
 		line_ascent = ascent * y_ppem / float (upem)
 		y_bearing = int (round (line_ascent - .5 * (line_height - height)))
 		advance = width
+                print "small glyph metrics h: %d w: %d a: %d" % (height, width, advance)
 		# smallGlyphMetrics
 		# Type	Name
 		# BYTE	height
@@ -115,10 +118,14 @@ class CBDT:
 		# CHAR	BearingX
 		# CHAR	BearingY
 		# BYTE	Advance
-		self.write (struct.pack ("BBbbB",
+                try:
+                        self.write (struct.pack ("BBbbB",
 					 height, width,
 					 x_bearing, y_bearing,
 					 advance))
+                except struct.error:
+                        raise ValueError("h: %d w: %d a: %d x: %d y: %d" % (
+                            height, width, advance, x_bearing, y_bearing))
 
 	def write_format1 (self, png):
 
@@ -437,8 +444,10 @@ By default they are dropped.
 	eblc.write_header ()
 	eblc.start_strikes (len (img_prefixes))
 
-	for img_prefix in img_prefixes:
+        def is_vs(cp):
+                return 0xfe00 <= cp <= 0xfe0f  # U+FE00..U+FE0F, inclusive
 
+	for img_prefix in img_prefixes:
 		print
 
 		img_files = {}
@@ -448,9 +457,14 @@ By default they are dropped.
 			codes = img_file[len (img_prefix):-4]
 			if "_" in codes:
 				pieces = codes.split ("_")
-				uchars = "".join ([unichr (int (code, 16)) for code in pieces])
+                                cps = [int(code, 16) for code in pieces]
+				uchars = "".join ([unichr(cp) for cp in cps if not is_vs(cp)])
 			else:
-				uchars = unichr (int (codes, 16))
+                                cp = int(codes, 16)
+                                if is_vs(cp):
+                                        print "ignoring unexpected vs input %04x" % cp
+                                        continue
+				uchars = unichr(cp)
 			img_files[uchars] = img_file
 		if not img_files:
 			raise Exception ("No image files found in '%s'." % glb)
@@ -460,7 +474,11 @@ By default they are dropped.
 		advance = width = height = 0
 		for uchars, img_file in img_files.items ():
 			if len (uchars) == 1:
-				glyph_name = unicode_cmap.cmap[ord (uchars)]
+                                try:
+                                        glyph_name = unicode_cmap.cmap[ord (uchars)]
+                                except KeyError:  # only a missing entry is reported
+                                        print "no cmap entry for %x" % ord(uchars)
+                                        raise ValueError("%x" % ord(uchars))
 			else:
 				glyph_name = get_glyph_name_from_gsub (uchars, font, unicode_cmap.cmap)
 			glyph_id = font.getGlyphID (glyph_name)
@@ -476,7 +494,7 @@ By default they are dropped.
 
 		glyphs = sorted (glyph_imgs.keys ())
 		if not glyphs:
-			raise Exception ("No common characteres found between font and '%s'." % glb)
+			raise Exception ("No common characters found between font and '%s'." % glb)
 		print "Embedding images for %d glyphs for this strike." % len (glyphs)
 
 		advance, width, height = (div (x, len (glyphs)) for x in (advance, width, height))