From 3e7a9bebe25bb89a4f05593ea143e37cf87f35f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mislav=20Marohni=C4=87?= Date: Thu, 2 May 2019 10:00:25 +0200 Subject: [PATCH] Parse emoji-test.txt from Unicode --- .gitignore | 1 - Rakefile | 16 +---- db/dump.rb | 151 +++++++++++------------------------------------ db/emoji-test.rb | 121 +++++++++++++++++++++++++++++++++++++ 4 files changed, 155 insertions(+), 134 deletions(-) create mode 100644 db/emoji-test.rb diff --git a/.gitignore b/.gitignore index 05300fa..a405fe8 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,5 @@ .ruby-version Gemfile.lock db/emoji-test.txt -db/ucd.nounihan.grouped.xml images/unicode/*.png vendor/ diff --git a/Rakefile b/Rakefile index e901948..2479d08 100644 --- a/Rakefile +++ b/Rakefile @@ -9,10 +9,8 @@ end namespace :db do desc %(Generate Emoji data files needed for development) - task generate: [ - 'db/ucd.nounihan.grouped.xml', + task :generate => [ 'db/emoji-test.txt', - 'db/raw-emoji.json' ] desc %(Dump a list of supported Emoji with Unicode descriptions and aliases) @@ -21,18 +19,6 @@ namespace :db do end end -file 'db/raw-emoji.json' do |t| - system 'curl', '-fsSL', 'https://raw.githack.com/koddsson/emoji.json/11.0.0/emoji.json', '-o', t.name -end - -file 'db/ucd.nounihan.grouped.xml' do - Dir.chdir('db') do - system 'curl', '-fsSLO', 'http://www.unicode.org/Public/11.0.0/ucdxml/ucd.nounihan.grouped.zip' - system 'unzip', '-q', 'ucd.nounihan.grouped.zip' - rm 'ucd.nounihan.grouped.zip' - end -end - file 'db/emoji-test.txt' do |t| system 'curl', '-fsSL', 'http://unicode.org/Public/emoji/11.0/emoji-test.txt', '-o', t.name end diff --git a/db/dump.rb b/db/dump.rb index 4bcb6fe..c09a3e7 100644 --- a/db/dump.rb +++ b/db/dump.rb @@ -1,128 +1,41 @@ +# frozen_string_literal: true + require 'emoji' require 'json' -require 'rexml/document' - -class UnicodeCharacter - attr_reader :code, :description, :version, :aliases - - class CharListener - CHAR_TAG = "char".freeze - - def self.parse(io, &block) - REXML::Document.parse_stream(io, self.new(&block)) - end - - def initialize(&block) - @callback = block - end - - def tag_start(name, attributes) - if CHAR_TAG == name - @callback.call( - attributes.fetch("cp") { return }, - attributes.fetch("na") { return }, - attributes.fetch("age", nil), - ) - end - end - - def method_missing(*) end - end - - def self.index - return @index if defined? @index - @index = {} - File.open(File.expand_path('../ucd.nounihan.grouped.xml', __FILE__)) do |source| - CharListener.parse(source) do |char, desc, age| - uc = UnicodeCharacter.new(char, desc, age) - @index[uc.code] = uc - end - end - @index - end - - def self.fetch(code) - code = code.to_s(16).rjust(4, '0') if code.is_a?(Integer) - self.index.fetch(code) - end - - def initialize(code, description, version) - @code = code.downcase - @description = description.downcase - @version = version - @aliases = [] - @references = [] - end - - def add_alias(string) - @aliases.concat string.split(/\s*,\s*/) - end - - def add_reference(code) - @references << code.downcase - end -end - -unless $stdin.tty? - codepoints = STDIN.read.chomp.codepoints.map { |code| - UnicodeCharacter.fetch(code) - } - codepoints.each do |char| - printf "%5s: %s", char.code.upcase, char.description - printf " (%s)", char.version if char.version - puts - end - exit -end - -trap(:PIPE) { abort } - -normalize = -> (raw) { - raw.sub(Emoji::VARIATION_SELECTOR_16, '') -} - -emojidesc = {} -File.open(File.expand_path('../emoji-test.txt', __FILE__)) do |file| - file.each do |line| - next if line =~ /^(#|$)/ - line = line.chomp.split('# ', 2)[1] - emoji, description = line.split(' ', 2) - emojidesc[normalize.(emoji)] = description - end -end +require_relative './emoji-test' items = [] -for category, emojis in Emoji.palette - for raw in emojis - emoji = Emoji.find_by_unicode(raw) - unicode_version = emoji ? emoji.unicode_version : '' - ios_version = emoji ? emoji.ios_version : '' +_, categories = EmojiTestParser.parse - unless raw.include?(Emoji::ZERO_WIDTH_JOINER) - uchar = UnicodeCharacter.fetch(raw.codepoints[0]) - unicode_version = uchar.version unless uchar.version.nil? +for category in categories + for sub_category in category[:emoji] + for emoji_item in sub_category[:emoji] + raw = emoji_item[:sequences][0] + existing_emoji = Emoji.find_by_unicode(raw) || Emoji.find_by_unicode("#{raw}\u{fe0f}") + output_item = { + emoji: raw, + description: emoji_item[:description], + category: category[:name], + } + if existing_emoji + output_item.update( + aliases: existing_emoji.aliases, + tags: existing_emoji.tags, + unicode_version: existing_emoji.unicode_version, + ios_version: existing_emoji.ios_version, + ) + else + output_item.update( + aliases: [emoji_item[:description].gsub(/\W+/, '_').downcase], + tags: [], + unicode_version: "11.0", + ios_version: "12.1", + ) + end + output_item[:skin_tones] = true if emoji_item[:skin_tones] + items << output_item end - - description = emojidesc.fetch(normalize.(raw)) - - if unicode_version == '' - warn "#{description} (#{raw}) doesn't have Unicode version" - end - - if ios_version == '' - ios_version = '10.2' - end - - items << { - emoji: raw, - description: description, - category: category, - aliases: emoji ? emoji.aliases : [description.gsub(/\W+/, '_').downcase], - tags: emoji ? emoji.tags : [], - unicode_version: unicode_version, - ios_version: ios_version, - } end end @@ -133,6 +46,8 @@ for emoji in Emoji.all.select(&:custom?) } end +trap(:PIPE) { abort } + puts JSON.pretty_generate(items) .gsub("\n\n", "\n") .gsub(/,\n( +)/) { "\n%s, " % $1[2..-1] } diff --git a/db/emoji-test.rb b/db/emoji-test.rb new file mode 100644 index 0000000..67b5661 --- /dev/null +++ b/db/emoji-test.rb @@ -0,0 +1,121 @@ +# frozen_string_literal: true + +module EmojiTestParser + VARIATION_SELECTOR_16 = "\u{fe0f}" + SKIN_TONES = [ + "\u{1F3FB}", # light skin tone + "\u{1F3FC}", # medium-light skin tone + "\u{1F3FD}", # medium skin tone + "\u{1F3FE}", # medium-dark skin tone + "\u{1F3FF}", # dark skin tone + ] + HAIR_MODIFIERS = [ + "\u{1F9B0}", # red-haired + "\u{1F9B1}", # curly-haired + "\u{1F9B2}", # bald + "\u{1F9B3}", # white-haired + ] + + module_function + + def parse + File.open(File.expand_path("../emoji-test.txt", __FILE__), "r:utf-8") do |file| + parse_file(file) + end + end + + def parse_file(io) + data = [] + emoji_map = {} + category = nil + sub_category = nil + + io.each do |line| + begin + if line.start_with?("# group: ") + _, group_name = line.split(":", 2) + category = { + name: group_name.strip, + emoji: [], + } + data << category + sub_category = nil + elsif line.start_with?("# subgroup: ") + _, group_name = line.split(":", 2) + sub_category = { + name: group_name.strip, + emoji: [], + } + category[:emoji] << sub_category + elsif line.start_with?("#") || line.strip.empty? + next + else + row, desc = line.split("#", 2) + desc = desc.strip.split(" ", 2)[1] + codepoints, _ = row.split(";", 2) + emoji_raw = codepoints.strip.split.map { |c| c.hex }.pack("U*") + next if HAIR_MODIFIERS.include?(emoji_raw) + emoji_normalized = emoji_raw + .gsub(VARIATION_SELECTOR_16, "") + .gsub(/(#{SKIN_TONES.join("|")})/o, "") + emoji_item = emoji_map[emoji_normalized] + if desc.end_with?(" skin tone") + emoji_item[:skin_tones] = true if emoji_item + next + end + if emoji_item + emoji_item[:sequences] << emoji_raw + else + emoji_item = { + sequences: [emoji_raw], + description: desc, + } + emoji_map[emoji_normalized] = emoji_item + sub_category[:emoji] << emoji_item + end + end + rescue + warn "line: %p" % line + raise + end + end + + [emoji_map, data] + end +end + +if $0 == __FILE__ + html_output = false + if ARGV[0] == "--html" + ARGV.shift + html_output = true + end + + _, categories = EmojiTestParser.parse + + trap(:PIPE) { abort } + + if html_output + puts "" + puts "" + for category in categories + puts "

#{category[:name]}

" + for sub_category in category[:emoji] + puts "

#{sub_category[:name]}

" + puts "
    " + for char in sub_category[:emoji] + puts "
  1. " + for sequence in char[:sequences] + codepoints = sequence.unpack("U*").map { |c| c.to_s(16).upcase }.join(" ") + printf '%s ', codepoints, sequence + end + puts "#{char[:description]}
  2. " + end + puts "
" + end + end + else + require "json" + puts JSON.pretty_generate(categories) + end +end