Merge pull request #83 from dougfelt/emoji_checK

Update emoji sequence check to be more flexible.
This commit is contained in:
dougfelt 2017-01-19 10:48:01 -08:00 committed by GitHub
commit 561d08a89c
2 changed files with 116 additions and 54 deletions

View file

@ -1,4 +1,4 @@
#!/usr/bin/python #!/usr/bin/env python
# #
# Copyright 2016 Google Inc. All rights reserved. # Copyright 2016 Google Inc. All rights reserved.
# #
@ -19,6 +19,7 @@
import argparse import argparse
import collections import collections
import glob import glob
import os
from os import path from os import path
import re import re
import sys import sys
@ -37,7 +38,7 @@ def _seq_string(seq):
return '_'.join('%04x' % cp for cp in seq) return '_'.join('%04x' % cp for cp in seq)
def _check_valid_emoji(sorted_seqs): def _check_valid_emoji(sorted_seq_to_filepath):
"""Ensure all emoji are either valid emoji or specific chars.""" """Ensure all emoji are either valid emoji or specific chars."""
valid_cps = set(unicode_data.get_emoji() | unicode_data.proposed_emoji_cps()) valid_cps = set(unicode_data.get_emoji() | unicode_data.proposed_emoji_cps())
@ -46,79 +47,80 @@ def _check_valid_emoji(sorted_seqs):
valid_cps.add(0xfe0f) # variation selector (emoji presentation) valid_cps.add(0xfe0f) # variation selector (emoji presentation)
valid_cps.add(0xfe82b) # PUA value for unknown flag valid_cps.add(0xfe82b) # PUA value for unknown flag
not_emoji = set() not_emoji = {}
for seq in sorted_seqs: for seq, fp in sorted_seq_to_filepath.iteritems():
for cp in seq: for cp in seq:
if cp not in valid_cps: if cp not in valid_cps:
not_emoji.add(cp) if cp not in not_emoji:
not_emoji[cp] = []
not_emoji[cp].append(fp)
if len(not_emoji): if len(not_emoji):
print >> sys.stderr, '%d non-emoji found:' % len(not_emoji) print >> sys.stderr, '%d non-emoji found:' % len(not_emoji)
for cp in sorted(not_emoji): for cp in sorted(not_emoji):
print >> sys.stderr, '%04X' % cp print >> sys.stderr, '%04x (in %s)' % (cp, ', '.join(not_emoji[cp]))
def _check_zwj(sorted_seqs): def _check_zwj(sorted_seq_to_filepath):
"""Ensure zwj is only between two appropriate emoji.""" """Ensure zwj is only between two appropriate emoji."""
ZWJ = 0x200D ZWJ = 0x200D
EMOJI_PRESENTATION_VS = 0xFE0F EMOJI_PRESENTATION_VS = 0xFE0F
for seq in sorted_seqs: for seq, fp in sorted_seq_to_filepath.iteritems():
if ZWJ not in seq: if ZWJ not in seq:
continue continue
if seq[0] == 0x200d: if seq[0] == 0x200d:
print >> sys.stderr, 'zwj at head of sequence' print >> sys.stderr, 'zwj at head of sequence in %s' % fp
if len(seq) == 1: if len(seq) == 1:
continue continue
if seq[-1] == 0x200d: if seq[-1] == 0x200d:
print >> sys.stderr, 'zwj at end of sequence' print >> sys.stderr, 'zwj at end of sequence in %s' % fp
for i, cp in enumerate(seq): for i, cp in enumerate(seq):
if cp == ZWJ: if cp == ZWJ:
if i > 0:
pcp = seq[i-1] pcp = seq[i-1]
if pcp != EMOJI_PRESENTATION_VS and not unicode_data.is_emoji(pcp): if pcp != EMOJI_PRESENTATION_VS and not unicode_data.is_emoji(pcp):
print >> sys.stderr, 'non-emoji %04X preceeds ZWJ' % pcp print >> sys.stderr, 'non-emoji %04x preceeds ZWJ in %s' % (pcp, fp)
if i < len(seq) - 1:
fcp = seq[i+1] fcp = seq[i+1]
if not unicode_data.is_emoji(fcp): if not unicode_data.is_emoji(fcp):
print >> sys.stderr, 'non-emoji %04X follows ZWJ' % fcp print >> sys.stderr, 'non-emoji %04x follows ZWJ in %s' % (fcp, fp)
def _check_flags(sorted_seqs): def _check_flags(sorted_seq_to_filepath):
"""Ensure regional indicators are only in sequences of one or two, and """Ensure regional indicators are only in sequences of one or two, and
never mixed.""" never mixed."""
for seq in sorted_seqs: for seq, fp in sorted_seq_to_filepath.iteritems():
have_reg = None have_reg = None
for cp in seq: for cp in seq:
is_reg = _is_regional_indicator(cp) is_reg = _is_regional_indicator(cp)
if have_reg == None: if have_reg == None:
have_reg = is_reg have_reg = is_reg
elif have_reg != is_reg: elif have_reg != is_reg:
print >> sys.stderr, ('mix of regional and non-regional in %s' % print >> sys.stderr, 'mix of regional and non-regional in %s' % fp
_seq_string(seq))
if have_reg and len(seq) > 2: if have_reg and len(seq) > 2:
# We provide dummy glyphs for regional indicators, so there are sequences # We provide dummy glyphs for regional indicators, so there are sequences
# with single regional indicator symbols. # with single regional indicator symbols.
print >> sys.stderr, ('regional indicator sequence length != 2: %s' % print >> sys.stderr, 'regional indicator sequence length != 2 in %s' % fp
_seq_string(seq))
def _check_skintone(sorted_seqs): def _check_skintone(sorted_seq_to_filepath):
"""Ensure skin tone modifiers are not applied to emoji that are not defined """Ensure skin tone modifiers are not applied to emoji that are not defined
to take them. May appear standalone, though. Also check that emoji that take to take them. May appear standalone, though. Also check that emoji that take
skin tone modifiers have a complete set.""" skin tone modifiers have a complete set."""
base_to_modifiers = collections.defaultdict(set) base_to_modifiers = collections.defaultdict(set)
for seq in sorted_seqs: for seq, fp in sorted_seq_to_filepath.iteritems():
for i, cp in enumerate(seq): for i, cp in enumerate(seq):
if _is_skintone_modifier(cp): if _is_skintone_modifier(cp):
if i == 0: if i == 0:
if len(seq) > 1: if len(seq) > 1:
print >> sys.stderr, 'skin color selector first in sequence %s' print >> sys.stderr, 'skin color selector first in sequence %s' % fp
# standalone are ok # standalone are ok
continue continue
pcp = seq[i-1] pcp = seq[i-1]
if not unicode_data.is_emoji_modifier_base(pcp): if not unicode_data.is_emoji_modifier_base(pcp):
print >> sys.stderr, ( print >> sys.stderr, (
'emoji skintone modifier applied to non-base at %d: %s' % ( 'emoji skintone modifier applied to non-base at %d: %s' % (i, fp))
i, _seq_string(seq)))
elif unicode_data.is_emoji_modifier_base(cp): elif unicode_data.is_emoji_modifier_base(cp):
if i < len(seq) - 1 and _is_skintone_modifier(seq[i+1]): if i < len(seq) - 1 and _is_skintone_modifier(seq[i+1]):
base_to_modifiers[cp].add(seq[i+1]) base_to_modifiers[cp].add(seq[i+1])
@ -126,36 +128,90 @@ def _check_skintone(sorted_seqs):
base_to_modifiers[cp] = set() base_to_modifiers[cp] = set()
for cp, modifiers in sorted(base_to_modifiers.iteritems()): for cp, modifiers in sorted(base_to_modifiers.iteritems()):
if len(modifiers) != 5: if len(modifiers) != 5:
print 'emoji base %04X has %d modifiers defined (%s)' % ( print 'emoji base %04x has %d modifiers defined (%s) in %s' % (
cp, len(modifiers), cp, len(modifiers),
', '.join('%04x' % cp for cp in sorted(modifiers))) ', '.join('%04x' % cp for cp in sorted(modifiers)), fp)
def check_sequences(seqs): def check_sequence_to_filepath(seq_to_filepath):
sorted_seqs = sorted(seqs) sorted_seq_to_filepath = collections.OrderedDict(
print 'checking %d sequences' % len(seqs) sorted(seq_to_filepath.items()))
_check_valid_emoji(sorted_seqs) _check_valid_emoji(sorted_seq_to_filepath)
_check_zwj(sorted_seqs) _check_zwj(sorted_seq_to_filepath)
_check_flags(sorted_seqs) _check_flags(sorted_seq_to_filepath)
_check_skintone(sorted_seqs) _check_skintone(sorted_seq_to_filepath)
print 'done.'
def _collect_sequences(dirs, prefix='emoji_u'): def create_sequence_to_filepath(name_to_dirpath, prefix, suffix):
seqs = set() """Check names, and convert name to sequences for names that are ok,
path_re = re.compile('%s([a-zA-Z0-9_]+)\.png' % prefix) returning a sequence to file path mapping. Reports bad segments
of a name to stderr."""
segment_re = re.compile(r'^[0-9a-f]{4,6}$')
result = {}
for name, dirname in name_to_dirpath.iteritems():
if not name.startswith(prefix):
print 'expected prefix "%s" for "%s"' % (prefix, name)
continue
segments = name[len(prefix): -len(suffix)].split('_')
segfail = False
seq = []
for s in segments:
if not segment_re.match(s):
print 'bad codepoint name "%s" in %s/%s' % (s, dirname, name)
segfail = True
continue
n = int(s, 16)
if n > 0x10ffff:
print 'codepoint "%s" out of range in %s/%s' % (s, dirname, name)
segfail = True
continue
seq.append(n)
if not segfail:
result[tuple(seq)] = path.join(dirname, name)
return result
def collect_name_to_dirpath(directory, prefix, suffix):
"""Return a mapping from filename to path rooted at directory, ignoring files
that don't match suffix. Report when a filename appears in more than one
subdir; the first path found is kept."""
result = {}
for dirname, _, files in os.walk(directory):
if directory != '.':
dirname = path.join(directory, dirname)
for f in files:
if not f.endswith(suffix):
continue
if f in result:
print >> sys.stderr, 'duplicate file "%s" in %s and %s ' % (
f, dirname, result[f])
continue
result[f] = dirname
return result
def collect_name_to_dirpath_with_override(dirs, prefix, suffix):
"""Return a mapping from filename to a directory path rooted at a directory
in dirs, using collect_name_to_dirpath. The last directory is retained. This
does not report an error if a file appears under more than one root directory,
so lets later root directories override earlier ones."""
result = {}
for d in dirs: for d in dirs:
for f in glob.glob(path.join(d, '%s*.png' % prefix)): result.update(collect_name_to_dirpath(d, prefix, suffix))
m = path_re.match(path.basename(f)) return result
if not m:
print >> sys.stderr, 'could not match file "%s"' % f
continue def run_check(dirs, prefix, suffix):
seq = tuple(int(s, 16) for s in m.group(1).split('_')) print 'Checking files with prefix "%s" and suffix "%s" in:\n %s' % (
if seq in seqs: prefix, suffix, '\n '.join(dirs))
print >> sys.stderr, 'duplicate sequence for "%s"' % f name_to_dirpath = collect_name_to_dirpath_with_override(
continue dirs, prefix=prefix, suffix=suffix)
seqs.add(seq) print 'checking %d names' % len(name_to_dirpath)
return seqs seq_to_filepath = create_sequence_to_filepath(name_to_dirpath, prefix, suffix)
print 'checking %d sequences' % len(seq_to_filepath)
check_sequence_to_filepath(seq_to_filepath)
print 'done.'
def main(): def main():
@ -163,8 +219,14 @@ def main():
parser.add_argument( parser.add_argument(
'-d', '--dirs', help='directories containing emoji images', '-d', '--dirs', help='directories containing emoji images',
metavar='dir', nargs='+', required=True) metavar='dir', nargs='+', required=True)
parser.add_argument(
'-p', '--prefix', help='prefix to match, default "emoji_u"',
metavar='pfx', default='emoji_u')
parser.add_argument(
'-s', '--suffix', help='suffix to match, default ".png"', metavar='sfx',
default='.png')
args = parser.parse_args() args = parser.parse_args()
check_sequences(_collect_sequences(args.dirs)) run_check(args.dirs, args.prefix, args.suffix)
if __name__ == '__main__': if __name__ == '__main__':

View file

@ -204,9 +204,9 @@ def _generate_content(basedir, font, dir_infos, limit, annotate, standalone):
if abs_srcdir == basedir: if abs_srcdir == basedir:
dirspec = '' dirspec = ''
elif abs_srcdir.startswith(basedir): elif abs_srcdir.startswith(basedir):
dirspec = abs_filedir[len(abs_basedir) + 1:] dirspec = abs_srcdir[len(basedir) + 1:]
else: else:
dirspec = abs_filedir dirspec = abs_srcdir
basepaths.append(dirspec) basepaths.append(dirspec)
lines = ['<table>'] lines = ['<table>']