From 5bec5cce2c0dd7318a1adf9dd69aa3cab88680b9 Mon Sep 17 00:00:00 2001
From: Doug Felt <dougfelt@users.noreply.github.com>
Date: Mon, 16 Apr 2018 18:25:58 -0700
Subject: [PATCH] Update check_emoji_sequences.

- use existing utilities in nototools/unicode_data and add_aliases
- add a check that file names do not use presentation selectors
- include tag characters in the set of valid cps that can appear in a
  sequence
- add a check for valid tag sequences (subregion flags: a black flag
  1f3f4 followed by tag characters and ending with the cancel tag e007f)
- separate out a check that no image is present for an alias source (we
  expect to alias that sequence, so there should not be an image with
  that name)
- filter data by age (somewhat), add a command line flag for it, and
  remove the hard-coded Unicode 9.0 value
- separate out the coverage check (useful when data is partial), add a
  command line flag for it, and don't run it by default
- add a command line flag to exclude subdirs by name when collecting
  images (see the example invocation below)
- refactor output so each error starts with consistent text naming the
  check that found it, and make the output a bit less verbose
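
The new flags can be combined; a hypothetical invocation (the directory
and subdir names below are placeholders) might look like:

  python check_emoji_sequences.py -d png/128 -e renamed_flags -u 10.0 -c

Image names are expected to follow the existing convention: the prefix
(default "emoji_u") followed by '_'-separated hex codepoints and the
suffix (default ".png"), e.g. emoji_u1f1e6_1f1e8.png.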
---
 check_emoji_sequences.py | 307 +++++++++++++++++++++++----------------
 1 file changed, 182 insertions(+), 125 deletions(-)

diff --git a/check_emoji_sequences.py b/check_emoji_sequences.py
index 083e0d5a8..f29bbe90b 100755
--- a/check_emoji_sequences.py
+++ b/check_emoji_sequences.py
@@ -26,25 +26,21 @@ import re
 import sys
 
 from nototools import unicode_data
-
-DATA_ROOT = path.dirname(path.abspath(__file__))
+import add_aliases
 
 ZWJ = 0x200d
 EMOJI_VS = 0xfe0f
 
-def _is_regional_indicator(cp):
-  return 0x1f1e6 <= cp <= 0x1f1ff
+END_TAG = 0xe007f  # CANCEL TAG, terminates a tag sequence
 
+def _make_tag_set():
+  tag_set = set()
+  tag_set |= set(range(0xe0030, 0xe003a))  # 0-9
+  tag_set |= set(range(0xe0061, 0xe007b))  # a-z
+  tag_set.add(END_TAG)
+  return tag_set
 
-def _is_skintone_modifier(cp):
-  return 0x1f3fb <= cp <= 0x1f3ff
-
-
-def _seq_string(seq):
-  return '_'.join('%04x' % cp for cp in seq)
-
-def strip_vs(seq):
-  return tuple(cp for cp in seq if cp != EMOJI_VS)
+TAG_SET = _make_tag_set()
 
 _namedata = None
 
@@ -54,7 +50,7 @@ def seq_name(seq):
   if not _namedata:
     def strip_vs_map(seq_map):
       return {
-          strip_vs(k): v
+          unicode_data.strip_emoji_vs(k): v
           for k, v in seq_map.iteritems()}
     _namedata = [
         strip_vs_map(unicode_data.get_emoji_combining_sequences()),
@@ -70,7 +66,7 @@ def seq_name(seq):
     if seq in data:
       return data[seq]
   if EMOJI_VS in seq:
-    non_vs_seq = strip_vs(seq)
+    non_vs_seq = unicode_data.strip_emoji_vs(seq)
     for data in _namedata:
       if non_vs_seq in data:
         return data[non_vs_seq]
@@ -78,14 +74,29 @@ def seq_name(seq):
   return None
 
 
-def _check_valid_emoji(sorted_seq_to_filepath):
-  """Ensure all emoji are either valid emoji or specific chars."""
+def _check_no_vs(sorted_seq_to_filepath):
+  """Our image data does not use emoji presentation variation selectors."""
+  for seq, fp in sorted_seq_to_filepath.iteritems():
+    if EMOJI_VS in seq:
+      print('check no VS: FE0F in path: %s' % fp)
 
-  valid_cps = set(unicode_data.get_emoji() | unicode_data.proposed_emoji_cps())
+
+def _check_valid_emoji_cps(sorted_seq_to_filepath, unicode_version):
+  """Ensure all cps in these sequences are valid emoji cps or specific cps
+  used in forming emoji sequences.  This is a 'pre-check' that reports
+  this specific problem."""
+
+  valid_cps = set(unicode_data.get_emoji())
+  if unicode_version is None or unicode_version >= unicode_data.PROPOSED_EMOJI_AGE:
+    valid_cps |= unicode_data.proposed_emoji_cps()
+  else:
+    valid_cps = set(
+        cp for cp in valid_cps if unicode_data.age(cp) <= unicode_version)
   valid_cps.add(0x200d)  # ZWJ
   valid_cps.add(0x20e3)  # combining enclosing keycap
   valid_cps.add(0xfe0f)  # variation selector (emoji presentation)
   valid_cps.add(0xfe82b)  # PUA value for unknown flag
+  valid_cps |= TAG_SET  # used in subregion tag sequences
 
   not_emoji = {}
   for seq, fp in sorted_seq_to_filepath.iteritems():
@@ -96,35 +107,43 @@ def _check_valid_emoji(sorted_seq_to_filepath):
         not_emoji[cp].append(fp)
 
   if len(not_emoji):
-    print('%d non-emoji found:' % len(not_emoji), file=sys.stderr)
+    print(
+        'check valid emoji cps: %d non-emoji cp found' % len(not_emoji),
+        file=sys.stderr)
     for cp in sorted(not_emoji):
-      print('%04x (in %s)' % (cp, ', '.join(not_emoji[cp])), file=sys.stderr)
+      fps = not_emoji[cp]
+      print(
+          'check valid emoji cps: %04x (in %d sequences)' % (cp, len(fps)),
+          file=sys.stderr)
 
 
 def _check_zwj(sorted_seq_to_filepath):
-  """Ensure zwj is only between two appropriate emoji."""
-  ZWJ = 0x200D
-  EMOJI_PRESENTATION_VS = 0xFE0F
+  """Ensure zwj is only between two appropriate emoji.  This is a 'pre-check'
+  that reports this specific problem."""
 
   for seq, fp in sorted_seq_to_filepath.iteritems():
     if ZWJ not in seq:
       continue
-    if seq[0] == 0x200d:
-      print('zwj at head of sequence in %s' % fp, file=sys.stderr)
+    if seq[0] == ZWJ:
+      print('check zwj: zwj at head of sequence in %s' % fp, file=sys.stderr)
     if len(seq) == 1:
       continue
-    if seq[-1] == 0x200d:
-      print('zwj at end of sequence in %s' % fp, file=sys.stderr)
+    if seq[-1] == ZWJ:
+      print('check zwj: zwj at end of sequence in %s' % fp, file=sys.stderr)
     for i, cp in enumerate(seq):
       if cp == ZWJ:
         if i > 0:
           pcp = seq[i-1]
-          if pcp != EMOJI_PRESENTATION_VS and not unicode_data.is_emoji(pcp):
-            print('non-emoji %04x preceeds ZWJ in %s' % (pcp, fp), file=sys.stderr)
+          if pcp != EMOJI_VS and not unicode_data.is_emoji(pcp):
+            print(
+                'check zwj: non-emoji %04x precedes ZWJ in %s' % (pcp, fp),
+                file=sys.stderr)
         if i < len(seq) - 1:
           fcp = seq[i+1]
           if not unicode_data.is_emoji(fcp):
-            print('non-emoji %04x follows ZWJ in %s' % (fcp, fp), file=sys.stderr)
+            print(
+                'check zwj: non-emoji %04x follows ZWJ in %s' % (fcp, fp),
+                file=sys.stderr)
 
 
 def _check_flags(sorted_seq_to_filepath):
@@ -133,15 +152,40 @@ def _check_flags(sorted_seq_to_filepath):
   for seq, fp in sorted_seq_to_filepath.iteritems():
     have_reg = None
     for cp in seq:
-      is_reg = _is_regional_indicator(cp)
+      is_reg = unicode_data.is_regional_indicator(cp)
       if have_reg == None:
         have_reg = is_reg
       elif have_reg != is_reg:
-        print('mix of regional and non-regional in %s' % fp, file=sys.stderr)
+        print(
+            'check flags: mix of regional and non-regional in %s' % fp,
+            file=sys.stderr)
     if have_reg and len(seq) > 2:
       # We provide dummy glyphs for regional indicators, so there are sequences
-      # with single regional indicator symbols.
-      print('regional indicator sequence length != 2 in %s' % fp, file=sys.stderr)
+      # with single regional indicator symbols; the len check handles this.
+      print(
+          'check flags: regional indicator sequence length != 2 in %s' % fp,
+          file=sys.stderr)
+
+def _check_tags(sorted_seq_to_filepath):
+  """Ensure tag sequences (for subregion flags) conform to the spec.  We don't
+  validate against CLDR, just that there's a sequence of 2 or more tags starting
+  and ending with the appropriate codepoints."""
+
+  BLACK_FLAG = 0x1f3f4  # WAVING BLACK FLAG, starts a tag sequence
+  BLACK_FLAG_SET = set([BLACK_FLAG])
+  for seq, fp in sorted_seq_to_filepath.iteritems():
+    seq_set = set(seq)
+    overlap_set = seq_set & TAG_SET
+    if not overlap_set:
+      continue
+    if seq[0] != BLACK_FLAG:
+      print('check tags: bad start tag in %s' % fp)
+    elif seq[-1] != END_TAG:
+      print('check tags: bad end tag in %s' % fp)
+    elif len(seq) < 4:
+      print('check tags: sequence too short in %s' % fp)
+    elif seq_set - TAG_SET != BLACK_FLAG_SET:
+      print('check tags: non-tag items in %s' % fp)
 
 
 def _check_skintone(sorted_seq_to_filepath):
@@ -151,90 +195,76 @@ def _check_skintone(sorted_seq_to_filepath):
   base_to_modifiers = collections.defaultdict(set)
   for seq, fp in sorted_seq_to_filepath.iteritems():
     for i, cp in enumerate(seq):
-      if _is_skintone_modifier(cp):
+      if unicode_data.is_skintone_modifier(cp):
         if i == 0:
           if len(seq) > 1:
-            print('skin color selector first in sequence %s' % fp, file=sys.stderr)
+            print(
+                'check skintone: skin color selector first in sequence %s' % fp,
+                file=sys.stderr)
           # standalone are ok
           continue
         pcp = seq[i-1]
         if not unicode_data.is_emoji_modifier_base(pcp):
-          print((
-              'emoji skintone modifier applied to non-base at %d: %s' % (i, fp)), file=sys.stderr)
-      elif unicode_data.is_emoji_modifier_base(cp):
-        if i < len(seq) - 1 and _is_skintone_modifier(seq[i+1]):
-          base_to_modifiers[cp].add(seq[i+1])
-        elif cp not in base_to_modifiers:
-          base_to_modifiers[cp] = set()
+          print(
+              'check skintone: emoji skintone modifier applied to non-base ' +
+              'at %d: %s' % (i, fp), file=sys.stderr)
+        else:
+          if pcp not in base_to_modifiers:
+            base_to_modifiers[pcp] = set()
+          base_to_modifiers[pcp].add(cp)
+
   for cp, modifiers in sorted(base_to_modifiers.iteritems()):
     if len(modifiers) != 5:
-      print('emoji base %04x has %d modifiers defined (%s) in %s' % (
-          cp, len(modifiers),
-          ', '.join('%04x' % cp for cp in sorted(modifiers)), fp), file=sys.stderr)
+      print(
+          'check skintone: base %04x has %d modifiers defined (%s) in %s' % (
+              cp, len(modifiers),
+              ', '.join('%04x' % cp for cp in sorted(modifiers)), fp),
+          file=sys.stderr)
 
 
-def _check_zwj_sequences(seq_to_filepath):
-  """Verify that zwj sequences are valid."""
-  zwj_sequence_to_name = unicode_data.get_emoji_zwj_sequences()
-  # strip emoji variant selectors and add extra mappings
-  zwj_sequence_without_vs_to_name_canonical = {}
-  for seq, seq_name in zwj_sequence_to_name.iteritems():
-    if EMOJI_VS in seq:
-      stripped_seq = strip_vs(seq)
-      zwj_sequence_without_vs_to_name_canonical[stripped_seq] = (seq_name, seq)
-
-  zwj_seq_to_filepath = {
-      seq: fp for seq, fp in seq_to_filepath.iteritems()
-      if ZWJ in seq}
-
-  for seq, fp in zwj_seq_to_filepath.iteritems():
-    if seq not in zwj_sequence_to_name:
-      if seq not in zwj_sequence_without_vs_to_name_canonical:
-        print('zwj sequence not defined: %s' % fp, file=sys.stderr)
-      else:
-        _, can = zwj_sequence_without_vs_to_name_canonical[seq]
-        # print >> sys.stderr, 'canonical sequence %s contains vs: %s' % (
-        #     _seq_string(can), fp)
-
-def read_emoji_aliases():
-  result = {}
-
-  with open(path.join(DATA_ROOT, 'emoji_aliases.txt'), 'r') as f:
-    for line in f:
-      ix = line.find('#')
-      if (ix > -1):
-        line = line[:ix]
-      line = line.strip()
-      if not line:
-        continue
-      als, trg = (s.strip() for s in line.split(';'))
-      als_seq = tuple([int(x, 16) for x in als.split('_')])
-      try:
-        trg_seq = tuple([int(x, 16) for x in trg.split('_')])
-      except:
-        print('cannot process alias %s -> %s' % (als, trg))
-        continue
-      result[als_seq] = trg_seq
-  return result
+def _check_zwj_sequences(sorted_seq_to_filepath, unicode_version):
+  """Verify that zwj sequences are valid for the given unicode version."""
+  for seq, fp in sorted_seq_to_filepath.iteritems():
+    if ZWJ not in seq:
+      continue
+    age = unicode_data.get_emoji_sequence_age(seq)
+    if age is None or unicode_version is not None and age > unicode_version:
+      print('check zwj sequences: undefined sequence %s' % fp)
 
 
-def _check_coverage(seq_to_filepath):
-  age = 9.0
+def _check_no_alias_sources(sorted_seq_to_filepath):
+  """Check that we don't have sequences that we expect to be aliased to
+  some other sequence."""
+  aliases = add_aliases.read_default_emoji_aliases()
+  for seq, fp in sorted_seq_to_filepath.iteritems():
+    if seq in aliases:
+      print('check no alias sources: aliased sequence %s' % fp)
+
+
+def _check_coverage(seq_to_filepath, unicode_version):
+  """Ensure we have all and only the cps and sequences that we need for the
+  font as of this version."""
+
+  age = unicode_version
 
   non_vs_to_canonical = {}
   for k in seq_to_filepath:
     if EMOJI_VS in k:
-      non_vs = strip_vs(k)
+      non_vs = unicode_data.strip_emoji_vs(k)
       non_vs_to_canonical[non_vs] = k
 
-  aliases = read_emoji_aliases()
+  aliases = add_aliases.read_default_emoji_aliases()
   for k, v in sorted(aliases.items()):
     if v not in seq_to_filepath and v not in non_vs_to_canonical:
-      print('alias %s missing target %s' % (_seq_string(k), _seq_string(v)))
+      alias_str = unicode_data.seq_to_string(k)
+      target_str = unicode_data.seq_to_string(v)
+      print('coverage: alias %s missing target %s' % (alias_str, target_str))
       continue
     if k in seq_to_filepath or k in non_vs_to_canonical:
-      print('alias %s already exists as %s (%s)' % (
-          _seq_string(k), _seq_string(v), seq_name(v)))
+      alias_str = unicode_data.seq_to_string(k)
+      target_str = unicode_data.seq_to_string(v)
+      print('coverage: alias %s already exists as %s (%s)' % (
+          alias_str, target_str, seq_name(v)))
       continue
     filename = seq_to_filepath.get(v) or seq_to_filepath[non_vs_to_canonical[v]]
     seq_to_filepath[k] = 'alias:' + filename
@@ -243,13 +273,15 @@ def _check_coverage(seq_to_filepath):
   emoji = sorted(unicode_data.get_emoji(age=age))
   for cp in emoji:
     if tuple([cp]) not in seq_to_filepath:
-      print('missing single %04x (%s)' % (cp, unicode_data.name(cp, '<no name>')))
+      print(
+          'coverage: missing single %04x (%s)' % (
+              cp, unicode_data.name(cp, '<no name>')))
 
   # special characters
   # all but combining enclosing keycap are currently marked as emoji
   for cp in [ord('*'), ord('#'), ord(u'\u20e3')] + range(0x30, 0x3a):
     if cp not in emoji and tuple([cp]) not in seq_to_filepath:
-      print('missing special %04x (%s)' % (cp, unicode_data.name(cp)))
+      print('coverage: missing special %04x (%s)' % (cp, unicode_data.name(cp)))
 
   # combining sequences
   comb_seq_to_name = sorted(
@@ -257,24 +289,26 @@ def _check_coverage(seq_to_filepath):
   for seq, name in comb_seq_to_name:
     if seq not in seq_to_filepath:
       # strip vs and try again
-      non_vs_seq = strip_vs(seq)
+      non_vs_seq = unicode_data.strip_emoji_vs(seq)
       if non_vs_seq not in seq_to_filepath:
-        print('missing combining sequence %s (%s)' % (_seq_string(seq), name))
+        print('coverage: missing combining sequence %s (%s)' %
+              (unicode_data.seq_to_string(seq), name))
 
   # flag sequences
   flag_seq_to_name = sorted(
       unicode_data.get_emoji_flag_sequences(age=age).iteritems())
   for seq, name in flag_seq_to_name:
     if seq not in seq_to_filepath:
-      print('missing flag sequence %s (%s)' % (_seq_string(seq), name))
+      print('coverage: missing flag sequence %s (%s)' %
+            (unicode_data.seq_to_string(seq), name))
 
   # skin tone modifier sequences
   mod_seq_to_name = sorted(
       unicode_data.get_emoji_modifier_sequences(age=age).iteritems())
   for seq, name in mod_seq_to_name:
     if seq not in seq_to_filepath:
-      print('missing modifier sequence %s (%s)' % (
-          _seq_string(seq), name))
+      print('coverage: missing modifier sequence %s (%s)' % (
+          unicode_data.seq_to_string(seq), name))
 
   # zwj sequences
   # some of ours include the emoji presentation variation selector and some
@@ -295,25 +329,30 @@ def _check_coverage(seq_to_filepath):
     else:
       test_seq = seq
     if test_seq not in zwj_seq_without_vs:
-      print('missing (canonical) zwj sequence %s (%s)' % (
-          _seq_string(seq), name))
+      print('coverage: missing (canonical) zwj sequence %s (%s)' % (
+          unicode_data.seq_to_string(seq), name))
 
   # check for 'unknown flag'
-  # this is either emoji_ufe82b or 'unknown_flag', we filter out things that
+  # this is either emoji_ufe82b or 'unknown_flag', but we filter out things that
   # don't start with our prefix so 'unknown_flag' would be excluded by default.
   if tuple([0xfe82b]) not in seq_to_filepath:
-    print('missing unknown flag PUA fe82b')
+    print('coverage: missing unknown flag PUA fe82b')
 
 
-def check_sequence_to_filepath(seq_to_filepath):
+def check_sequence_to_filepath(seq_to_filepath, unicode_version, coverage):
   sorted_seq_to_filepath = collections.OrderedDict(
       sorted(seq_to_filepath.items()))
-  _check_valid_emoji(sorted_seq_to_filepath)
+  _check_no_vs(sorted_seq_to_filepath)
+  _check_valid_emoji_cps(sorted_seq_to_filepath, unicode_version)
   _check_zwj(sorted_seq_to_filepath)
   _check_flags(sorted_seq_to_filepath)
+  _check_tags(sorted_seq_to_filepath)
   _check_skintone(sorted_seq_to_filepath)
-  _check_zwj_sequences(sorted_seq_to_filepath)
-  _check_coverage(sorted_seq_to_filepath)
+  _check_zwj_sequences(sorted_seq_to_filepath, unicode_version)
+  _check_no_alias_sources(sorted_seq_to_filepath)
+  if coverage:
+    _check_coverage(sorted_seq_to_filepath, unicode_version)
+
 
 def create_sequence_to_filepath(name_to_dirpath, prefix, suffix):
   """Check names, and convert name to sequences for names that are ok,
@@ -345,12 +384,15 @@ def create_sequence_to_filepath(name_to_dirpath, prefix, suffix):
   return result
 
 
-def collect_name_to_dirpath(directory, prefix, suffix):
+def collect_name_to_dirpath(directory, prefix, suffix, exclude=None):
   """Return a mapping from filename to path rooted at directory, ignoring files
-  that don't match suffix.  Report when a filename appears in more than one
-  subdir; the first path found is kept."""
+  that don't match suffix, and subtrees with names in exclude.  Report when a
+  filename appears in more than one subdir; the first path found is kept."""
   result = {}
-  for dirname, _, files in os.walk(directory):
+  for dirname, dirs, files in os.walk(directory, topdown=True):
+    if exclude:
+      dirs[:] = [d for d in dirs if d not in exclude]
+
     if directory != '.':
       dirname = path.join(directory, dirname)
     for f in files:
@@ -364,42 +406,57 @@ def collect_name_to_dirpath(directory, prefix, suffix):
   return result
 
 
-def collect_name_to_dirpath_with_override(dirs, prefix, suffix):
+def collect_name_to_dirpath_with_override(dirs, prefix, suffix, exclude=None):
   """Return a mapping from filename to a directory path rooted at a directory
   in dirs, using collect_name_to_filepath.  The last directory is retained. This
   does not report an error if a file appears under more than one root directory,
-  so lets later root directories override earlier ones."""
+  so it lets later root directories override earlier ones.  Use 'exclude' to
+  name subdirectories (of any root) whose subtree you wish to skip."""
   result = {}
   for d in dirs:
-    result.update(collect_name_to_dirpath(d, prefix, suffix))
+    result.update(collect_name_to_dirpath(d, prefix, suffix, exclude))
   return result
 
 
-def run_check(dirs, prefix, suffix):
-  print('Checking files with prefix "%s" and suffix "%s" in:\n  %s' % (
-      prefix, suffix, '\n  '.join(dirs)))
+def run_check(dirs, prefix, suffix, exclude, unicode_version, coverage):
+  msg = ''
+  if unicode_version:
+    msg = ' (%3.1f)' % unicode_version
+  print('Checking files with prefix "%s" and suffix "%s"%s in:\n  %s' % (
+      prefix, suffix, msg, '\n  '.join(dirs)))
   name_to_dirpath = collect_name_to_dirpath_with_override(
-      dirs, prefix=prefix, suffix=suffix)
+      dirs, prefix=prefix, suffix=suffix, exclude=exclude)
   print('checking %d names' % len(name_to_dirpath))
   seq_to_filepath = create_sequence_to_filepath(name_to_dirpath, prefix, suffix)
   print('checking %d sequences' % len(seq_to_filepath))
-  check_sequence_to_filepath(seq_to_filepath)
+  check_sequence_to_filepath(seq_to_filepath, unicode_version, coverage)
   print('done.')
 
 
 def main():
   parser = argparse.ArgumentParser()
   parser.add_argument(
-      '-d', '--dirs', help='directories containing emoji images',
+      '-d', '--dirs', help='directory roots containing emoji images',
       metavar='dir', nargs='+', required=True)
+  parser.add_argument(
+      '-e', '--exclude', help='names of source subdirs to exclude',
+      metavar='dir', nargs='+')
+  parser.add_argument(
+      '-c', '--coverage', help='test for complete coverage',
+      action='store_true')
   parser.add_argument(
       '-p', '--prefix', help='prefix to match, default "emoji_u"',
       metavar='pfx', default='emoji_u')
   parser.add_argument(
       '-s', '--suffix', help='suffix to match, default ".png"', metavar='sfx',
       default='.png')
+  parser.add_argument(
+      '-u', '--unicode_version', help='limit to this unicode version or before',
+      metavar='version', type=float)
   args = parser.parse_args()
-  run_check(args.dirs, args.prefix, args.suffix)
+  run_check(
+      args.dirs, args.prefix, args.suffix, args.exclude, args.unicode_version,
+      args.coverage)
 
 
 if __name__ == '__main__':