mirror of
https://github.com/googlefonts/noto-emoji.git
synced 2025-06-08 15:57:59 +00:00
Merge pull request #47 from dougfelt/check_emoji_sequences
Add tool to check/validate emoji sequences in image file naming.
This commit is contained in:
commit
8fba2e60fc
1 changed files with 168 additions and 0 deletions
168
check_emoji_sequences.py
Executable file
168
check_emoji_sequences.py
Executable file
|
@ -0,0 +1,168 @@
|
|||
#!/usr/bin/python
|
||||
#
|
||||
# Copyright 2016 Google Inc. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Compare emoji image file namings against unicode property data."""
|
||||
|
||||
import argparse
|
||||
import collections
|
||||
import glob
|
||||
from os import path
|
||||
import re
|
||||
import sys
|
||||
|
||||
from nototools import unicode_data
|
||||
|
||||
def _is_regional_indicator(cp):
|
||||
return 0x1f1e6 <= cp <= 0x1f1ff
|
||||
|
||||
|
||||
def _is_skintone_modifier(cp):
|
||||
return 0x1f3fb <= cp <= 0x1f3ff
|
||||
|
||||
|
||||
def _seq_string(seq):
|
||||
return '_'.join('%04x' % cp for cp in seq)
|
||||
|
||||
|
||||
def _check_valid_emoji(sorted_seqs):
|
||||
"""Ensure all emoji are either valid emoji or specific chars."""
|
||||
|
||||
valid_cps = set(unicode_data.get_emoji() | unicode_data.proposed_emoji_cps())
|
||||
valid_cps.add(0x200d) # ZWJ
|
||||
valid_cps.add(0x20e3) # combining enclosing keycap
|
||||
valid_cps.add(0xfe0f) # variation selector (emoji presentation)
|
||||
valid_cps.add(0xfe82b) # PUA value for unknown flag
|
||||
|
||||
not_emoji = set()
|
||||
for seq in sorted_seqs:
|
||||
for cp in seq:
|
||||
if cp not in valid_cps:
|
||||
not_emoji.add(cp)
|
||||
|
||||
if len(not_emoji):
|
||||
print >> sys.stderr, '%d non-emoji found:' % len(not_emoji)
|
||||
for cp in sorted(not_emoji):
|
||||
print >> sys.stderr, '%04X' % cp
|
||||
|
||||
|
||||
def _check_zwj(sorted_seqs):
|
||||
"""Ensure zwj is only between two appropriate emoji."""
|
||||
ZWJ = 0x200D
|
||||
EMOJI_PRESENTATION_VS = 0xFE0F
|
||||
|
||||
for seq in sorted_seqs:
|
||||
if ZWJ not in seq:
|
||||
continue
|
||||
if seq[0] == 0x200d:
|
||||
print >> sys.stderr, 'zwj at head of sequence'
|
||||
if len(seq) == 1:
|
||||
continue
|
||||
if seq[-1] == 0x200d:
|
||||
print >> sys.stderr, 'zwj at end of sequence'
|
||||
for i, cp in enumerate(seq):
|
||||
if cp == ZWJ:
|
||||
pcp = seq[i-1]
|
||||
if pcp != EMOJI_PRESENTATION_VS and not unicode_data.is_emoji(pcp):
|
||||
print >> sys.stderr, 'non-emoji %04X preceeds ZWJ' % pcp
|
||||
fcp = seq[i+1]
|
||||
if not unicode_data.is_emoji(fcp):
|
||||
print >> sys.stderr, 'non-emoji %04X follows ZWJ' % fcp
|
||||
|
||||
|
||||
def _check_flags(sorted_seqs):
|
||||
"""Ensure regional indicators are only in sequences of one or two, and
|
||||
never mixed."""
|
||||
for seq in sorted_seqs:
|
||||
have_reg = None
|
||||
for cp in seq:
|
||||
is_reg = _is_regional_indicator(cp)
|
||||
if have_reg == None:
|
||||
have_reg = is_reg
|
||||
elif have_reg != is_reg:
|
||||
print >> sys.stderr, ('mix of regional and non-regional in %s' %
|
||||
_seq_string(seq))
|
||||
if have_reg and len(seq) > 2:
|
||||
# We provide dummy glyphs for regional indicators, so there are sequences
|
||||
# with single regional indicator symbols.
|
||||
print >> sys.stderr, ('regional indicator sequence length != 2: %s' %
|
||||
_seq_string(seq))
|
||||
|
||||
|
||||
def _check_skintone(sorted_seqs):
|
||||
"""Ensure skin tone modifiers are not applied to emoji that are not defined
|
||||
to take them. May appear standalone, though. Also check that emoji that take
|
||||
skin tone modifiers have a complete set."""
|
||||
base_to_modifiers = collections.defaultdict(set)
|
||||
for seq in sorted_seqs:
|
||||
for i, cp in enumerate(seq):
|
||||
if _is_skintone_modifier(cp):
|
||||
if i == 0:
|
||||
if len(seq) > 1:
|
||||
print >> sys.stderr, 'skin color selector first in sequence %s'
|
||||
# standalone are ok
|
||||
continue
|
||||
pcp = seq[i-1]
|
||||
if not unicode_data.is_emoji_modifier_base(pcp):
|
||||
print >> sys.stderr, (
|
||||
'emoji skintone modifier applied to non-base at %d: %s' % (
|
||||
i, _seq_string(seq)))
|
||||
elif unicode_data.is_emoji_modifier_base(cp):
|
||||
if i < len(seq) - 1 and _is_skintone_modifier(seq[i+1]):
|
||||
base_to_modifiers[cp].add(seq[i+1])
|
||||
elif cp not in base_to_modifiers:
|
||||
base_to_modifiers[cp] = set()
|
||||
for cp, modifiers in sorted(base_to_modifiers.iteritems()):
|
||||
if len(modifiers) != 5:
|
||||
print 'emoji base %04X has %d modifiers defined (%s)' % (
|
||||
cp, len(modifiers),
|
||||
', '.join('%04x' % cp for cp in sorted(modifiers)))
|
||||
|
||||
def check_sequences(seqs):
|
||||
sorted_seqs = sorted(seqs)
|
||||
_check_valid_emoji(sorted_seqs)
|
||||
_check_zwj(sorted_seqs)
|
||||
_check_flags(sorted_seqs)
|
||||
_check_skintone(sorted_seqs)
|
||||
|
||||
|
||||
def _collect_sequences(dirs, prefix='emoji_u'):
|
||||
seqs = set()
|
||||
path_re = re.compile('%s([a-zA-Z0-9_]+)\.png' % prefix)
|
||||
for d in dirs:
|
||||
for f in glob.glob(path.join(d, '%s*.png' % prefix)):
|
||||
m = path_re.match(path.basename(f))
|
||||
if not m:
|
||||
print >> sys.stderr, 'could not match file "%s"' % f
|
||||
continue
|
||||
seq = tuple(int(s, 16) for s in m.group(1).split('_'))
|
||||
if seq in seqs:
|
||||
print >> sys.stderr, 'duplicate sequence for "%s"' % f
|
||||
continue
|
||||
seqs.add(seq)
|
||||
return seqs
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
'-d', '--dirs', help='directories containing emoji images',
|
||||
metavar='dir', nargs='+', required=True)
|
||||
args = parser.parse_args()
|
||||
check_sequences(_collect_sequences(args.dirs))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
Loading…
Add table
Reference in a new issue