Parse emoji-test.txt from Unicode
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -4,6 +4,5 @@
|
||||
.ruby-version
|
||||
Gemfile.lock
|
||||
db/emoji-test.txt
|
||||
db/ucd.nounihan.grouped.xml
|
||||
images/unicode/*.png
|
||||
vendor/
|
||||
|
||||
16
Rakefile
16
Rakefile
@@ -9,10 +9,8 @@ end
|
||||
|
||||
namespace :db do
|
||||
desc %(Generate Emoji data files needed for development)
|
||||
task generate: [
|
||||
'db/ucd.nounihan.grouped.xml',
|
||||
task :generate => [
|
||||
'db/emoji-test.txt',
|
||||
'db/raw-emoji.json'
|
||||
]
|
||||
|
||||
desc %(Dump a list of supported Emoji with Unicode descriptions and aliases)
|
||||
@@ -21,18 +19,6 @@ namespace :db do
|
||||
end
|
||||
end
|
||||
|
||||
file 'db/raw-emoji.json' do |t|
|
||||
system 'curl', '-fsSL', 'https://raw.githack.com/koddsson/emoji.json/11.0.0/emoji.json', '-o', t.name
|
||||
end
|
||||
|
||||
file 'db/ucd.nounihan.grouped.xml' do
|
||||
Dir.chdir('db') do
|
||||
system 'curl', '-fsSLO', 'http://www.unicode.org/Public/11.0.0/ucdxml/ucd.nounihan.grouped.zip'
|
||||
system 'unzip', '-q', 'ucd.nounihan.grouped.zip'
|
||||
rm 'ucd.nounihan.grouped.zip'
|
||||
end
|
||||
end
|
||||
|
||||
file 'db/emoji-test.txt' do |t|
|
||||
system 'curl', '-fsSL', 'http://unicode.org/Public/emoji/11.0/emoji-test.txt', '-o', t.name
|
||||
end
|
||||
|
||||
151
db/dump.rb
151
db/dump.rb
@@ -1,128 +1,41 @@
|
||||
# frozen_string_literal: true
|
||||
|
||||
require 'emoji'
|
||||
require 'json'
|
||||
require 'rexml/document'
|
||||
|
||||
# A single character record read from `ucd.nounihan.grouped.xml`, which is
# expected to sit next to this file.
#
# Records are keyed by their lower-cased hexadecimal code point string and are
# loaded lazily, once, through UnicodeCharacter.index.
class UnicodeCharacter
  attr_reader :code, :description, :version, :aliases, :references

  # REXML stream listener that invokes a callback for every `<char>` element
  # carrying both a `cp` and an `na` attribute.
  class CharListener
    CHAR_TAG = "char".freeze

    # Streams +io+ through REXML and yields `(cp, na, age)` for each matching
    # element; `age` is nil when the attribute is absent.
    def self.parse(io, &block)
      REXML::Document.parse_stream(io, new(&block))
    end

    def initialize(&block)
      @callback = block
    end

    # Called by REXML for every opening tag. Elements other than `<char>`, and
    # `<char>` elements lacking `cp` or `na`, are ignored.
    def tag_start(name, attributes)
      return unless CHAR_TAG == name

      # `return` inside the fetch block leaves tag_start without calling back.
      codepoint = attributes.fetch("cp") { return }
      char_name = attributes.fetch("na") { return }
      @callback.call(codepoint, char_name, attributes.fetch("age", nil))
    end

    # Swallow every other listener callback REXML sends.
    def method_missing(*) end
  end

  # Returns the Hash of code => UnicodeCharacter, loading it on first use.
  #
  # The table is memoized only after the whole file has been parsed, so a
  # failed load (missing file, malformed XML) is raised again on the next call
  # instead of leaving an empty or partial index behind.
  def self.index
    @index ||= load_index
  end

  # Looks up a character by code point.
  #
  # code - an Integer code point, or a hexadecimal String in any letter case.
  #
  # Raises KeyError when the code point is not in the index.
  def self.fetch(code)
    key = code.is_a?(Integer) ? code.to_s(16).rjust(4, '0') : code.to_s.downcase
    index.fetch(key)
  end

  # Parses the XML file into a fresh lookup table.
  def self.load_index
    table = {}
    File.open(File.expand_path('ucd.nounihan.grouped.xml', __dir__)) do |source|
      CharListener.parse(source) do |char, desc, age|
        record = new(char, desc, age)
        table[record.code] = record
      end
    end
    table
  end
  private_class_method :load_index

  # code        - hexadecimal code point String (stored lower-cased).
  # description - character name String (stored lower-cased).
  # version     - value of the `age` attribute, or nil.
  def initialize(code, description, version)
    @code = code.downcase
    @description = description.downcase
    @version = version
    @aliases = []
    @references = []
  end

  # Appends every comma-separated name in +string+ to the alias list.
  def add_alias(string)
    @aliases.concat string.split(/\s*,\s*/)
  end

  # Records a reference to another code point (stored lower-cased).
  def add_reference(code)
    @references << code.downcase
  end
end
|
||||
|
||||
unless $stdin.tty?
|
||||
codepoints = STDIN.read.chomp.codepoints.map { |code|
|
||||
UnicodeCharacter.fetch(code)
|
||||
}
|
||||
codepoints.each do |char|
|
||||
printf "%5s: %s", char.code.upcase, char.description
|
||||
printf " (%s)", char.version if char.version
|
||||
puts
|
||||
end
|
||||
exit
|
||||
end
|
||||
|
||||
trap(:PIPE) { abort }
|
||||
|
||||
normalize = -> (raw) {
|
||||
raw.sub(Emoji::VARIATION_SELECTOR_16, '')
|
||||
}
|
||||
|
||||
emojidesc = {}
|
||||
File.open(File.expand_path('../emoji-test.txt', __FILE__)) do |file|
|
||||
file.each do |line|
|
||||
next if line =~ /^(#|$)/
|
||||
line = line.chomp.split('# ', 2)[1]
|
||||
emoji, description = line.split(' ', 2)
|
||||
emojidesc[normalize.(emoji)] = description
|
||||
end
|
||||
end
|
||||
require_relative './emoji-test'
|
||||
|
||||
items = []
|
||||
|
||||
for category, emojis in Emoji.palette
|
||||
for raw in emojis
|
||||
emoji = Emoji.find_by_unicode(raw)
|
||||
unicode_version = emoji ? emoji.unicode_version : ''
|
||||
ios_version = emoji ? emoji.ios_version : ''
|
||||
_, categories = EmojiTestParser.parse
|
||||
|
||||
unless raw.include?(Emoji::ZERO_WIDTH_JOINER)
|
||||
uchar = UnicodeCharacter.fetch(raw.codepoints[0])
|
||||
unicode_version = uchar.version unless uchar.version.nil?
|
||||
for category in categories
|
||||
for sub_category in category[:emoji]
|
||||
for emoji_item in sub_category[:emoji]
|
||||
raw = emoji_item[:sequences][0]
|
||||
existing_emoji = Emoji.find_by_unicode(raw) || Emoji.find_by_unicode("#{raw}\u{fe0f}")
|
||||
output_item = {
|
||||
emoji: raw,
|
||||
description: emoji_item[:description],
|
||||
category: category[:name],
|
||||
}
|
||||
if existing_emoji
|
||||
output_item.update(
|
||||
aliases: existing_emoji.aliases,
|
||||
tags: existing_emoji.tags,
|
||||
unicode_version: existing_emoji.unicode_version,
|
||||
ios_version: existing_emoji.ios_version,
|
||||
)
|
||||
else
|
||||
output_item.update(
|
||||
aliases: [emoji_item[:description].gsub(/\W+/, '_').downcase],
|
||||
tags: [],
|
||||
unicode_version: "11.0",
|
||||
ios_version: "12.1",
|
||||
)
|
||||
end
|
||||
output_item[:skin_tones] = true if emoji_item[:skin_tones]
|
||||
items << output_item
|
||||
end
|
||||
|
||||
description = emojidesc.fetch(normalize.(raw))
|
||||
|
||||
if unicode_version == ''
|
||||
warn "#{description} (#{raw}) doesn't have Unicode version"
|
||||
end
|
||||
|
||||
if ios_version == ''
|
||||
ios_version = '10.2'
|
||||
end
|
||||
|
||||
items << {
|
||||
emoji: raw,
|
||||
description: description,
|
||||
category: category,
|
||||
aliases: emoji ? emoji.aliases : [description.gsub(/\W+/, '_').downcase],
|
||||
tags: emoji ? emoji.tags : [],
|
||||
unicode_version: unicode_version,
|
||||
ios_version: ios_version,
|
||||
}
|
||||
end
|
||||
end
|
||||
|
||||
@@ -133,6 +46,8 @@ for emoji in Emoji.all.select(&:custom?)
|
||||
}
|
||||
end
|
||||
|
||||
trap(:PIPE) { abort }
|
||||
|
||||
puts JSON.pretty_generate(items)
|
||||
.gsub("\n\n", "\n")
|
||||
.gsub(/,\n( +)/) { "\n%s, " % $1[2..-1] }
|
||||
|
||||
121
db/emoji-test.rb
Normal file
121
db/emoji-test.rb
Normal file
@@ -0,0 +1,121 @@
|
||||
# frozen_string_literal: true
|
||||
|
||||
# Parser for the `emoji-test.txt` data file.
#
# The file is read line by line:
#   * `# group: NAME` starts a new category,
#   * `# subgroup: NAME` starts a new sub-category inside the current category,
#   * other `#` lines and blank lines are ignored,
#   * every remaining line is `CODEPOINTS ; STATUS # EMOJI DESCRIPTION`.
#
# Sequences that are equal once variation selector 16 and skin tone modifiers
# are removed are collected into a single emoji item.
module EmojiTestParser
  VARIATION_SELECTOR_16 = "\u{fe0f}".freeze
  SKIN_TONES = [
    "\u{1F3FB}", # light skin tone
    "\u{1F3FC}", # medium-light skin tone
    "\u{1F3FD}", # medium skin tone
    "\u{1F3FE}", # medium-dark skin tone
    "\u{1F3FF}", # dark skin tone
  ].freeze
  HAIR_MODIFIERS = [
    "\u{1F9B0}", # red-haired
    "\u{1F9B1}", # curly-haired
    "\u{1F9B2}", # bald
    "\u{1F9B3}", # white-haired
  ].freeze

  # Matches any single skin tone modifier; built once instead of per call.
  SKIN_TONE_PATTERN = Regexp.union(SKIN_TONES)
  GROUP_PREFIX = "# group: ".freeze
  SUBGROUP_PREFIX = "# subgroup: ".freeze
  SKIN_TONE_SUFFIX = " skin tone".freeze

  module_function

  # Parses the `emoji-test.txt` file located next to this source file.
  #
  # Returns the same `[emoji_map, data]` pair as parse_file.
  def parse
    File.open(File.expand_path("emoji-test.txt", __dir__), "r:utf-8") do |file|
      parse_file(file)
    end
  end

  # Parses emoji-test data from +io+ (anything that yields lines via `each`).
  #
  # Returns a two-element Array:
  #   emoji_map - Hash of normalized sequence => emoji item
  #   data      - Array of categories `{ name:, emoji: [sub_category, ...] }`,
  #               each sub-category being `{ name:, emoji: [emoji_item, ...] }`
  #
  # An emoji item is `{ sequences: [String, ...], description: String }` and
  # gains `skin_tones: true` when a skin tone variant of it was seen.
  #
  # When a line cannot be processed, the offending line is written to stderr
  # and the original exception is re-raised.
  def parse_file(io)
    data = []
    emoji_map = {}
    category = nil
    sub_category = nil

    io.each do |line|
      begin
        if line.start_with?(GROUP_PREFIX)
          category = build_group(line)
          data << category
          sub_category = nil
        elsif line.start_with?(SUBGROUP_PREFIX)
          sub_category = build_group(line)
          category[:emoji] << sub_category
        elsif line.start_with?("#") || line.strip.empty?
          next
        else
          emoji_raw, desc = parse_emoji_line(line)
          next if HAIR_MODIFIERS.include?(emoji_raw)

          normalized = normalize_sequence(emoji_raw)
          emoji_item = emoji_map[normalized]

          if desc.end_with?(SKIN_TONE_SUFFIX)
            # Skin tone variants are not listed; they only flag the base emoji
            # (and are dropped when no base emoji has been seen).
            emoji_item[:skin_tones] = true if emoji_item
          elsif emoji_item
            emoji_item[:sequences] << emoji_raw
          else
            emoji_item = { sequences: [emoji_raw], description: desc }
            emoji_map[normalized] = emoji_item
            sub_category[:emoji] << emoji_item
          end
        end
      rescue
        warn "line: %p" % line
        raise
      end
    end

    [emoji_map, data]
  end

  # Builds an empty group or subgroup from a `# group: NAME` style header.
  def build_group(line)
    { name: line.split(":", 2)[1].strip, emoji: [] }
  end

  # Splits a data line into `[raw emoji String, description]`.
  def parse_emoji_line(line)
    # Only the first "#" starts the comment; later ones belong to the text
    # (for example the description "keycap: #").
    row, comment = line.split("#", 2)
    # The comment is "EMOJI DESCRIPTION"; drop the leading emoji token.
    description = comment.strip.split(" ", 2)[1]
    codepoints = row.split(";", 2).first
    [codepoints.split.map(&:hex).pack("U*"), description]
  end

  # Removes variation selector 16 and skin tone modifiers from +sequence+.
  def normalize_sequence(sequence)
    sequence.gsub(VARIATION_SELECTOR_16, "").gsub(SKIN_TONE_PATTERN, "")
  end

  private_class_method :build_group, :parse_emoji_line, :normalize_sequence
end
|
||||
|
||||
# Command-line entry point: prints the parsed categories as pretty-printed
# JSON, or as a simple HTML page when the first argument is `--html`.
if $PROGRAM_NAME == __FILE__
  html_output = ARGV[0] == "--html"
  ARGV.shift if html_output

  _, categories = EmojiTestParser.parse

  # Exit instead of raising when the reader of our output goes away.
  trap(:PIPE) { abort }

  if html_output
    require "erb"
    # Names and descriptions come from the data file and may contain
    # characters such as "&", so they are escaped before being embedded.
    escape = ->(text) { ERB::Util.html_escape(text) }

    puts "<!doctype html>"
    puts "<meta charset=utf-8>"
    categories.each do |category|
      puts "<h2>#{escape.call(category[:name])}</h2>"
      category[:emoji].each do |sub_category|
        puts "<h3>#{escape.call(sub_category[:name])}</h3>"
        puts "<ol>"
        sub_category[:emoji].each do |char|
          puts "<li>"
          char[:sequences].each do |sequence|
            # The tooltip lists the sequence's code points in upper-case hex.
            codepoints = sequence.unpack("U*").map { |c| c.to_s(16).upcase }.join(" ")
            printf '<span class=emoji title="%s">%s</span> ', codepoints, escape.call(sequence)
          end
          puts "#{escape.call(char[:description])}</li>"
        end
        puts "</ol>"
      end
    end
  else
    require "json"
    puts JSON.pretty_generate(categories)
  end
end
|
||||
Reference in New Issue
Block a user