From b99cd4b22b94ff7653624c2c14a9fed86c74a6b2 Mon Sep 17 00:00:00 2001
From: Doug Felt <dougfelt@google.com>
Date: Tue, 24 Jan 2017 11:40:13 -0800
Subject: [PATCH] Enhance check_emoji_sequences tool.

- includes aliases
- checks coverage of sequences (assumes full coverage of all Unicode
  emoji and sequences for now)
- reports sequence names

(Some of this code needs to be moved elsewhere: sequence name lookup and
emoji_vs stripping don't belong here, since these operations are more
generally useful.  That will come.)
---
 check_emoji_sequences.py | 178 ++++++++++++++++++++++++++++++++++-----
 1 file changed, 155 insertions(+), 23 deletions(-)

diff --git a/check_emoji_sequences.py b/check_emoji_sequences.py
index 1278ed5ab..d2147e5f6 100755
--- a/check_emoji_sequences.py
+++ b/check_emoji_sequences.py
@@ -26,6 +26,8 @@ import sys
 
 from nototools import unicode_data
 
+DATA_ROOT = path.dirname(path.abspath(__file__))
+
 ZWJ = 0x200d
 EMOJI_VS = 0xfe0f
 
@@ -40,6 +42,40 @@ def _is_skintone_modifier(cp):
 def _seq_string(seq):
   return '_'.join('%04x' % cp for cp in seq)
 
+def strip_vs(seq):  # returns a tuple of seq's items, minus every EMOJI_VS
+  return tuple(filter(lambda cp: cp != EMOJI_VS, seq))
+
+_namedata = None
+
+def seq_name(seq):
+  """Return the name of seq (a tuple of codepoints), or None if not found.
+
+  A lone codepoint is named via unicode_data.name(cp, None); longer
+  sequences are looked up as given, then with EMOJI_VS removed."""
+  global _namedata
+  if not _namedata:
+    # build the lookup tables once, keyed by vs-stripped sequence
+    sources = (
+        unicode_data.get_emoji_combining_sequences,
+        unicode_data.get_emoji_flag_sequences,
+        unicode_data.get_emoji_modifier_sequences,
+        unicode_data.get_emoji_zwj_sequences)
+    _namedata = [
+        {strip_vs(k): v for k, v in source().iteritems()}
+        for source in sources]
+
+  if len(seq) == 1:
+    return unicode_data.name(seq[0], None)
+
+  candidates = [seq]
+  if EMOJI_VS in seq:
+    candidates.append(strip_vs(seq))
+  for candidate in candidates:
+    for data in _namedata:
+      if candidate in data:
+        return data[candidate]
+  return None
+
 
 def _check_valid_emoji(sorted_seq_to_filepath):
   """Ensure all emoji are either valid emoji or specific chars."""
@@ -138,38 +174,134 @@ def _check_skintone(sorted_seq_to_filepath):
 
 def _check_zwj_sequences(seq_to_filepath):
   """Verify that zwj sequences are valid."""
-  zwj_sequence_to_type = unicode_data.get_emoji_zwj_sequences()
-  # strip emoji variant selectors and add these back in
-  zwj_sequence_without_vs_to_type_canonical = {}
-  for seq, seq_type in zwj_sequence_to_type.iteritems():
+  zwj_sequence_to_name = unicode_data.get_emoji_zwj_sequences()
+  # map vs-stripped canonical sequences to (name, canonical sequence)
+  zwj_sequence_without_vs_to_name_canonical = {}
+  for seq, seq_name in zwj_sequence_to_name.iteritems():
     if EMOJI_VS in seq:
-      stripped_seq = tuple(s for s in seq if s != EMOJI_VS)
-      zwj_sequence_without_vs_to_type_canonical[stripped_seq] = (seq_type, seq)
+      stripped_seq = strip_vs(seq)
+      zwj_sequence_without_vs_to_name_canonical[stripped_seq] = (seq_name, seq)
 
   zwj_seq_to_filepath = {
       seq: fp for seq, fp in seq_to_filepath.iteritems()
       if ZWJ in seq}
 
   for seq, fp in zwj_seq_to_filepath.iteritems():
-    if seq not in zwj_sequence_to_type:
-      if seq not in zwj_sequence_without_vs_to_type_canonical:
+    if seq not in zwj_sequence_to_name:
+      if seq not in zwj_sequence_without_vs_to_name_canonical:
         print >> sys.stderr, 'zwj sequence not defined: %s' % fp
       else:
-        _, can = zwj_sequence_without_vs_to_type_canonical[seq]
-        print >> sys.stderr, 'canonical sequence %s contains vs: %s' % (
-            _seq_string(can), fp)
+        _, can = zwj_sequence_without_vs_to_name_canonical[seq]
+        # Reporting 'canonical sequence ... contains vs' is disabled here, so
+        # 'can' is currently unused.
 
-  # check that all zwj sequences are covered
-  for seq in zwj_seq_to_filepath:
-    if seq in zwj_sequence_to_type:
-      del zwj_sequence_to_type[seq]
-    elif seq in zwj_sequence_without_vs_to_type_canonical:
-      canon_seq = zwj_sequence_without_vs_to_type_canonical[seq][1]
-      del zwj_sequence_to_type[canon_seq]
-  if zwj_sequence_to_type:
-    print >> sys.stderr, 'missing %d zwj sequences' % len(zwj_sequence_to_type)
-    for seq, seq_type in sorted(zwj_sequence_to_type.items()):
-      print >> sys.stderr, '  %s: %s' % (_seq_string(seq), seq_type)
+def read_emoji_aliases():
+  """Return a dict from alias sequence to target sequence (tuples of ints).
+
+  Reads 'alias;target' lines from emoji_aliases.txt; '#' starts a comment."""
+  result = {}
+  with open(path.join(DATA_ROOT, 'emoji_aliases.txt'), 'r') as f:
+    for line in f:
+      line = line.partition('#')[0].strip()
+      if not line:
+        continue
+      als, trg = (s.strip() for s in line.split(';'))
+      als_seq = tuple([int(x, 16) for x in als.split('_')])
+      try:
+        trg_seq = tuple([int(x, 16) for x in trg.split('_')])
+      except ValueError:
+        # int() rejects non-hex text; don't swallow unrelated exceptions
+        print 'cannot process alias %s -> %s' % (als, trg)
+        continue
+      result[als_seq] = trg_seq
+  return result
+
+
+def _check_coverage(seq_to_filepath):
+  """Report Unicode emoji and sequences missing from seq_to_filepath.
+
+  Aliases read from emoji_aliases.txt are first added to seq_to_filepath
+  (modified in place) as 'alias:' plus the target's filepath.  Then single
+  emoji, special characters, combining, flag, modifier, and zwj sequences
+  from unicode_data at the hard-coded age, and the unknown-flag PUA fe82b,
+  are looked up; whatever is missing is printed to stdout.
+
+  Args:
+    seq_to_filepath: dict from sequence (tuple of codepoints) to filepath.
+  """
+  age = 9.0
+
+  # map the vs-stripped form of each of our sequences to the one we have
+  non_vs_to_canonical = {
+      strip_vs(k): k for k in seq_to_filepath if EMOJI_VS in k}
+
+  aliases = read_emoji_aliases()
+  for k, v in sorted(aliases.items()):
+    if v not in seq_to_filepath and v not in non_vs_to_canonical:
+      print 'alias %s missing target %s' % (_seq_string(k), _seq_string(v))
+      continue
+    if k in seq_to_filepath or k in non_vs_to_canonical:
+      print 'alias %s already exists as %s (%s)' % (
+          _seq_string(k), _seq_string(v), seq_name(v))
+      continue
+    filename = seq_to_filepath.get(v) or seq_to_filepath[non_vs_to_canonical[v]]
+    seq_to_filepath[k] = 'alias:' + filename
+
+  # check single emoji, this includes most of the special chars
+  emoji = sorted(unicode_data.get_emoji(age=age))
+  for cp in emoji:
+    if tuple([cp]) not in seq_to_filepath:
+      print 'missing single %04x (%s)' % (cp, unicode_data.name(cp, '<no name>'))
+
+  # special characters
+  # all but combining enclosing keycap are currently marked as emoji
+  for cp in [ord('*'), ord('#'), ord(u'\u20e3')] + range(0x30, 0x3a):
+    if cp not in emoji and tuple([cp]) not in seq_to_filepath:
+      print 'missing special %04x (%s)' % (cp, unicode_data.name(cp))
+
+  # combining sequences
+  comb_seq_to_name = sorted(
+      unicode_data.get_emoji_combining_sequences(age=age).iteritems())
+  for seq, name in comb_seq_to_name:
+    if seq not in seq_to_filepath:
+      # strip vs and try again
+      non_vs_seq = strip_vs(seq)
+      if non_vs_seq not in seq_to_filepath:
+        print 'missing combining sequence %s (%s)' % (_seq_string(seq), name)
+
+  # flag sequences
+  flag_seq_to_name = sorted(
+      unicode_data.get_emoji_flag_sequences(age=age).iteritems())
+  for seq, name in flag_seq_to_name:
+    if seq not in seq_to_filepath:
+      print 'missing flag sequence %s (%s)' % (_seq_string(seq), name)
+
+  # skin tone modifier sequences
+  mod_seq_to_name = sorted(
+      unicode_data.get_emoji_modifier_sequences(age=age).iteritems())
+  for seq, name in mod_seq_to_name:
+    if seq not in seq_to_filepath:
+      print 'missing modifier sequence %s (%s)' % (
+          _seq_string(seq), name)
+
+  # zwj sequences
+  # some of ours include the emoji presentation variation selector and some
+  # don't, and the same is true for the canonical sequences.  normalize all
+  # of them to omit it to test coverage, but report the canonical sequence.
+  zwj_seq_without_vs = set(
+      strip_vs(seq) for seq in seq_to_filepath if ZWJ in seq)
+
+  for seq, name in sorted(
+      unicode_data.get_emoji_zwj_sequences(age=age).iteritems()):
+    if strip_vs(seq) not in zwj_seq_without_vs:
+      print 'missing (canonical) zwj sequence %s (%s)' % (
+          _seq_string(seq), name)
+
+  # check for 'unknown flag'
+  # this is either emoji_ufe82b or 'unknown_flag', we filter out things that
+  # don't start with our prefix so 'unknown_flag' would be excluded by default.
+  if tuple([0xfe82b]) not in seq_to_filepath:
+    print 'missing unknown flag PUA fe82b'
 
 
 def check_sequence_to_filepath(seq_to_filepath):
@@ -180,7 +312,7 @@ def check_sequence_to_filepath(seq_to_filepath):
   _check_flags(sorted_seq_to_filepath)
   _check_skintone(sorted_seq_to_filepath)
   _check_zwj_sequences(sorted_seq_to_filepath)
-
+  _check_coverage(sorted_seq_to_filepath)
 
 def create_sequence_to_filepath(name_to_dirpath, prefix, suffix):
   """Check names, and convert name to sequences for names that are ok,