Parse emoji-test.txt from Unicode

This commit is contained in:
Mislav Marohnić
2019-05-02 10:00:25 +02:00
parent 70b65c3b10
commit 3e7a9bebe2
4 changed files with 155 additions and 134 deletions

1
.gitignore vendored
View File

@@ -4,6 +4,5 @@
.ruby-version
Gemfile.lock
db/emoji-test.txt
db/ucd.nounihan.grouped.xml
images/unicode/*.png
vendor/

View File

@@ -9,10 +9,8 @@ end
namespace :db do
desc %(Generate Emoji data files needed for development)
task generate: [
'db/ucd.nounihan.grouped.xml',
task :generate => [
'db/emoji-test.txt',
'db/raw-emoji.json'
]
desc %(Dump a list of supported Emoji with Unicode descriptions and aliases)
@@ -21,18 +19,6 @@ namespace :db do
end
end
file 'db/raw-emoji.json' do |t|
system 'curl', '-fsSL', 'https://raw.githack.com/koddsson/emoji.json/11.0.0/emoji.json', '-o', t.name
end
file 'db/ucd.nounihan.grouped.xml' do
Dir.chdir('db') do
system 'curl', '-fsSLO', 'http://www.unicode.org/Public/11.0.0/ucdxml/ucd.nounihan.grouped.zip'
system 'unzip', '-q', 'ucd.nounihan.grouped.zip'
rm 'ucd.nounihan.grouped.zip'
end
end
file 'db/emoji-test.txt' do |t|
system 'curl', '-fsSL', 'http://unicode.org/Public/emoji/11.0/emoji-test.txt', '-o', t.name
end

View File

@@ -1,128 +1,41 @@
# frozen_string_literal: true
require 'emoji'
require 'json'
require 'rexml/document'
# A single character record with its code point (lowercase hex string),
# lowercase description, version string, and a list of aliases.
class UnicodeCharacter
  attr_reader :code, :description, :version, :aliases

  # Stream listener that reports every "char" element carrying both a "cp"
  # and an "na" attribute to the block it was created with.
  class CharListener
    CHAR_TAG = "char".freeze

    # Streams +io+ through REXML, yielding (cp, na, age) for each matching
    # element. "age" is nil when the attribute is absent.
    def self.parse(io, &block)
      listener = new(&block)
      REXML::Document.parse_stream(io, listener)
    end

    def initialize(&block)
      @callback = block
    end

    # Called for each opening tag. Elements that are not "char", or that lack
    # "cp" or "na", are skipped without invoking the callback.
    def tag_start(name, attributes)
      return unless CHAR_TAG == name

      code_point = attributes.fetch("cp") { return }
      char_name = attributes.fetch("na") { return }
      age = attributes.fetch("age", nil)
      @callback.call(code_point, char_name, age)
    end

    # Every other listener callback is accepted and ignored.
    def method_missing(*)
    end
  end

  # Returns the Hash of code => UnicodeCharacter, building it from
  # ucd.nounihan.grouped.xml (resolved relative to this file) on first use.
  def self.index
    unless defined? @index
      @index = {}
      xml_path = File.expand_path('../ucd.nounihan.grouped.xml', __FILE__)
      File.open(xml_path) do |source|
        CharListener.parse(source) do |char, desc, age|
          entry = UnicodeCharacter.new(char, desc, age)
          @index[entry.code] = entry
        end
      end
    end
    @index
  end

  # Looks up a character by code. An Integer is converted to a hex string
  # left-padded with zeros to at least four digits; any other value is used
  # as the key unchanged. Raises KeyError when the code is not indexed.
  def self.fetch(code)
    key = code.is_a?(Integer) ? code.to_s(16).rjust(4, '0') : code
    index.fetch(key)
  end

  def initialize(code, description, version)
    @aliases = []
    @references = []
    @code = code.downcase
    @description = description.downcase
    @version = version
  end

  # Appends each comma-separated name in +string+ to the alias list and
  # returns that list.
  def add_alias(string)
    names = string.split(/\s*,\s*/)
    @aliases.concat(names)
  end

  # Records a lowercased reference code and returns the reference list.
  def add_reference(code)
    @references << code.downcase
  end
end
unless $stdin.tty?
codepoints = STDIN.read.chomp.codepoints.map { |code|
UnicodeCharacter.fetch(code)
}
codepoints.each do |char|
printf "%5s: %s", char.code.upcase, char.description
printf " (%s)", char.version if char.version
puts
end
exit
end
trap(:PIPE) { abort }
normalize = -> (raw) {
raw.sub(Emoji::VARIATION_SELECTOR_16, '')
}
emojidesc = {}
File.open(File.expand_path('../emoji-test.txt', __FILE__)) do |file|
file.each do |line|
next if line =~ /^(#|$)/
line = line.chomp.split('# ', 2)[1]
emoji, description = line.split(' ', 2)
emojidesc[normalize.(emoji)] = description
end
end
require_relative './emoji-test'
items = []
for category, emojis in Emoji.palette
for raw in emojis
emoji = Emoji.find_by_unicode(raw)
unicode_version = emoji ? emoji.unicode_version : ''
ios_version = emoji ? emoji.ios_version : ''
_, categories = EmojiTestParser.parse
unless raw.include?(Emoji::ZERO_WIDTH_JOINER)
uchar = UnicodeCharacter.fetch(raw.codepoints[0])
unicode_version = uchar.version unless uchar.version.nil?
for category in categories
for sub_category in category[:emoji]
for emoji_item in sub_category[:emoji]
raw = emoji_item[:sequences][0]
existing_emoji = Emoji.find_by_unicode(raw) || Emoji.find_by_unicode("#{raw}\u{fe0f}")
output_item = {
emoji: raw,
description: emoji_item[:description],
category: category[:name],
}
if existing_emoji
output_item.update(
aliases: existing_emoji.aliases,
tags: existing_emoji.tags,
unicode_version: existing_emoji.unicode_version,
ios_version: existing_emoji.ios_version,
)
else
output_item.update(
aliases: [emoji_item[:description].gsub(/\W+/, '_').downcase],
tags: [],
unicode_version: "11.0",
ios_version: "12.1",
)
end
output_item[:skin_tones] = true if emoji_item[:skin_tones]
items << output_item
end
description = emojidesc.fetch(normalize.(raw))
if unicode_version == ''
warn "#{description} (#{raw}) doesn't have Unicode version"
end
if ios_version == ''
ios_version = '10.2'
end
items << {
emoji: raw,
description: description,
category: category,
aliases: emoji ? emoji.aliases : [description.gsub(/\W+/, '_').downcase],
tags: emoji ? emoji.tags : [],
unicode_version: unicode_version,
ios_version: ios_version,
}
end
end
@@ -133,6 +46,8 @@ for emoji in Emoji.all.select(&:custom?)
}
end
trap(:PIPE) { abort }
puts JSON.pretty_generate(items)
.gsub("\n\n", "\n")
.gsub(/,\n( +)/) { "\n%s, " % $1[2..-1] }

121
db/emoji-test.rb Normal file
View File

@@ -0,0 +1,121 @@
# frozen_string_literal: true
# Reads an emoji-test.txt style listing and groups its entries by the
# "# group:" and "# subgroup:" headers found in the file.
module EmojiTestParser
  VARIATION_SELECTOR_16 = "\u{fe0f}"

  SKIN_TONES = [
    "\u{1F3FB}", # light skin tone
    "\u{1F3FC}", # medium-light skin tone
    "\u{1F3FD}", # medium skin tone
    "\u{1F3FE}", # medium-dark skin tone
    "\u{1F3FF}", # dark skin tone
  ]

  HAIR_MODIFIERS = [
    "\u{1F9B0}", # red-haired
    "\u{1F9B1}", # curly-haired
    "\u{1F9B2}", # bald
    "\u{1F9B3}", # white-haired
  ]

  module_function

  # Parses the emoji-test.txt file that sits next to this source file.
  # Returns the same pair as parse_file.
  def parse
    path = File.expand_path("../emoji-test.txt", __FILE__)
    File.open(path, "r:utf-8") { |file| parse_file(file) }
  end

  # Parses +io+ line by line and returns [emoji_map, categories].
  #
  # categories is an Array of { name:, emoji: [sub_category, ...] }, where
  # each sub_category is { name:, emoji: [item, ...] } and each item is
  # { sequences: [String, ...], description: String }, plus
  # skin_tones: true when a " skin tone" variant of the item was seen.
  #
  # emoji_map maps each item's normalized sequence (variation selector 16
  # and skin tone modifiers removed) to that same item Hash.
  #
  # Any error raised while handling a line is re-raised after the offending
  # line has been written to standard error.
  def parse_file(io)
    categories = []
    items_by_key = {}
    current_category = nil
    current_sub_category = nil

    # Every character that is dropped from a sequence to form its map key.
    ignorable = VARIATION_SELECTOR_16 + SKIN_TONES.join
    # Builds an empty group or subgroup from a "# group: Name" style header.
    new_section = ->(header) { { name: header.split(":", 2)[1].strip, emoji: [] } }

    io.each do |line|
      begin
        if line.start_with?("# group: ")
          current_category = new_section.(line)
          categories << current_category
          current_sub_category = nil
          next
        end

        if line.start_with?("# subgroup: ")
          current_sub_category = new_section.(line)
          current_category[:emoji] << current_sub_category
          next
        end

        # Remaining comment lines and blank lines carry no data.
        next if line.start_with?("#") || line.strip.empty?

        # Data line: "<hex code points> ; <status> # <glyph> <description>"
        row, comment = line.split("#", 2)
        desc = comment.strip.split(" ", 2)[1]
        hex_codes = row.split(";", 2).first.strip.split
        emoji_raw = hex_codes.map(&:hex).pack("U*")
        next if HAIR_MODIFIERS.include?(emoji_raw)

        key = emoji_raw.delete(ignorable)
        known = items_by_key[key]

        if desc.end_with?(" skin tone")
          # Skin tone variants are not listed themselves; they only flag
          # the base item when it has already been seen.
          known[:skin_tones] = true if known
        elsif known
          known[:sequences] << emoji_raw
        else
          fresh = { sequences: [emoji_raw], description: desc }
          items_by_key[key] = fresh
          current_sub_category[:emoji] << fresh
        end
      rescue
        warn "line: %p" % line
        raise
      end
    end

    [items_by_key, categories]
  end
end
# Command-line entry point, run only when this file is executed directly.
#
# With "--html" as the first argument, prints an HTML page listing every
# category, sub-category and emoji item; otherwise prints the parsed
# categories as pretty-printed JSON.
if $0 == __FILE__
  html_output = ARGV[0] == "--html"
  ARGV.shift if html_output

  _, categories = EmojiTestParser.parse

  # Exit via abort when a PIPE signal arrives
  # (presumably because the reader of our output went away).
  trap(:PIPE) { abort }

  if html_output
    require "erb"
    # Names and descriptions come straight from the data file and may
    # contain characters such as "&", so escape them before embedding.
    escape = ERB::Util.method(:html_escape)

    puts "<!doctype html>"
    puts "<meta charset=utf-8>"
    categories.each do |category|
      puts "<h2>#{escape.(category[:name])}</h2>"
      category[:emoji].each do |sub_category|
        puts "<h3>#{escape.(sub_category[:name])}</h3>"
        puts "<ol>"
        sub_category[:emoji].each do |char|
          puts "<li>"
          char[:sequences].each do |sequence|
            # The title shows the sequence as space-separated uppercase hex.
            codepoints = sequence.unpack("U*").map { |c| c.to_s(16).upcase }.join(" ")
            printf '<span class=emoji title="%s">%s</span> ', codepoints, escape.(sequence)
          end
          puts "#{escape.(char[:description])}</li>"
        end
        puts "</ol>"
      end
    end
  else
    require "json"
    puts JSON.pretty_generate(categories)
  end
end