Compare commits

...

1 Commits

Author SHA1 Message Date
Vicent Marti
1e85635643 Replace emoji from a LUT 2015-05-18 17:41:13 +02:00
9 changed files with 288 additions and 0 deletions

View File

@@ -1,4 +1,5 @@
require 'rake/testtask'
require 'rake/extensiontask'
task :default => :test
@@ -36,3 +37,16 @@ namespace :images do
Emoji::Extractor.new(64, "#{gem_dir}/images/emoji/unicode").extract!
end
end
namespace :c do
  # Writes the C lookup tables produced by Emoji::Tables to ext/gemoji/emoji.h.
  task :headers do
    # Required inside the task, so it is only loaded when the task runs.
    require 'emoji/tables'

    root = File.dirname(File.realpath(__FILE__))
    header_path = File.join(root, "ext/gemoji/emoji.h")

    File.open(header_path, "w") do |header|
      header.puts(Emoji::Tables.generate_length_tables)
    end
  end
end
Rake::ExtensionTask.new('gemoji')

11
ext/gemoji/emoji.h Normal file
View File

@@ -0,0 +1,11 @@
/*
 * Candidate emoji lengths in bytes, one row per group of leading bytes.
 * Each row lists its lengths longest-first and is terminated by 0.
 * Row 0 is empty; rows are selected through emoji_magic_bytes.
 * This table is generated by Emoji::Tables.generate_length_tables.
 */
static const long emoji_byte_lengths[][5] = {
{0},
{11, 8, 7, 4, 0},
{6, 3, 0},
{7, 0},
{6, 0},
{5, 0},
};
static const int8_t emoji_magic_bytes[] =
{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

6
ext/gemoji/extconf.rb Normal file
View File

@@ -0,0 +1,6 @@
# Build configuration for the `gemoji` C extension.
require 'mkmf'
# NOTE(review): debug info and -O0 are appended for every build, not only for
# debugging; with gcc/clang a later -O flag overrides earlier ones, so this
# presumably disables optimisation - confirm this is intended.
$CFLAGS << ' -ggdb3 -O0 '
dir_config('gemoji')
create_makefile('gemoji')

125
ext/gemoji/gemoji.c Normal file
View File

@@ -0,0 +1,125 @@
#include <ruby.h>
#include <ruby/encoding.h>
#include "emoji.h"
#define unlikely(x) __builtin_expect((x),0)
/*
 * Tries each candidate length from the 0-terminated list `possible_len`
 * against the start of `rb_bytes` and looks the resulting prefix up in
 * `rb_unicode_map`.
 *
 * `rb_bytes` is truncated in place before every lookup, so the list is
 * expected to be ordered longest-first: once the string has been shortened,
 * any longer candidate no longer fits and is skipped.
 *
 * Returns the matched length and stores the hash value in `*rb_emoji`, or
 * returns 0 when no candidate produced a hit.
 */
static long
lookup_emoji(VALUE *rb_emoji, VALUE rb_bytes, VALUE rb_unicode_map, const long *possible_len)
{
	const uint8_t *bytes = (uint8_t *)RSTRING_PTR(rb_bytes);
	const long *candidate;

	for (candidate = possible_len; *candidate != 0; candidate++) {
		const long len = *candidate;

		/* Only try lengths that fit and whose last byte is a UTF-8 continuation byte. */
		if (len <= RSTRING_LEN(rb_bytes) && (bytes[len - 1] & 0xC0) == 0x80) {
			rb_str_set_len(rb_bytes, len);
			*rb_emoji = rb_hash_lookup(rb_unicode_map, rb_bytes);
			if (!NIL_P(*rb_emoji))
				return len;
		}
	}

	return 0;
}
/*
 * Scans `size` bytes at `src` for emoji sequences that are keys of
 * `rb_unicode_map` and builds a new UTF-8 string. Each match yields its hash
 * value to the caller's block: a nil result keeps the original bytes, any
 * other result must be a String (Check_Type) and is appended instead.
 *
 * Returns Qnil when the first scan reaches the end of the input without
 * stopping at a candidate position (this includes size == 0); no buffers are
 * allocated in that case. Otherwise returns the newly built string.
 */
static VALUE
replace_emoji(const uint8_t *src, long size, VALUE rb_unicode_map)
{
VALUE rb_emoji, rb_bytes, rb_out = Qnil;
long i = 0, org, emoji_len;
int8_t emoji_byte;
while (i < size) {
/* org marks the start of the text that has not been copied to rb_out yet. */
org = i;
retry_search:
/* Skip bytes whose emoji_magic_bytes entry is 0 (cannot start an emoji). */
while (i < size && (emoji_byte = emoji_magic_bytes[(int)src[i]]) == 0)
i++;
/* A candidate byte followed by a byte with the high bit clear is skipped. */
if (i + 1 < size && (src[i + 1] & 0x80) != 0x80) {
i++;
goto retry_search;
}
/* org is 0 on the first pass of the outer loop: allocate the buffers lazily. */
if (unlikely(org == 0)) {
if (i == size)
return Qnil;
rb_out = rb_str_buf_new(size * 4 / 3);
rb_enc_associate(rb_out, rb_utf8_encoding());
/* Scratch string reused as the hash lookup key for every candidate. */
rb_bytes = rb_str_buf_new(16);
rb_enc_associate(rb_bytes, rb_utf8_encoding());
}
/* Flush the plain text skipped over since org. */
if (i > org)
rb_str_buf_cat(rb_out, (const char *)src + org, i - org);
if (unlikely(i == size))
break;
/* Copy at most 12 bytes starting at the candidate into the scratch string. */
emoji_len = size - i;
if (emoji_len > 12)
emoji_len = 12;
memcpy(RSTRING_PTR(rb_bytes), src + i, emoji_len);
rb_str_set_len(rb_bytes, emoji_len);
/* emoji_byte selects the row of candidate lengths for this leading byte. */
emoji_len = lookup_emoji(&rb_emoji, rb_bytes, rb_unicode_map, emoji_byte_lengths[(int)emoji_byte]);
if (emoji_len) {
VALUE rb_repl = rb_yield(rb_emoji);
if (NIL_P(rb_repl)) {
/* Block declined to replace: keep the original emoji bytes. */
rb_str_buf_cat(rb_out, (const char *)src + i, emoji_len);
} else {
Check_Type(rb_repl, T_STRING);
rb_str_buf_append(rb_out, rb_repl);
}
i += emoji_len;
continue;
}
/* No emoji at this position: copy the single candidate byte and move on. */
rb_str_buf_cat(rb_out, (const char *)src + i, 1);
i++;
}
return rb_out;
}
/*
 * Implementation of Emoji#gsub_unicode(source) { |emoji| ... }.
 *
 * Fetches the lookup hash by calling `unicodes_index` on the receiver
 * (`klass`), validates the arguments and delegates to replace_emoji().
 *
 * Returns `rb_source` itself when it is flagged as ASCII-only or when
 * replace_emoji() reports that nothing needed scanning; otherwise returns the
 * newly built string. Raises Encoding::CompatibilityError for a source whose
 * encoding is not UTF-8, and a TypeError (via Check_Type) when the source is
 * not a String or the index is not a Hash.
 */
static VALUE
rb_gemoji_replace_unicode(VALUE klass, VALUE rb_source)
{
	VALUE rb_index = rb_funcall(klass, rb_intern("unicodes_index"), 0);
	VALUE rb_replaced;

	Check_Type(rb_source, T_STRING);
	Check_Type(rb_index, T_HASH);
	rb_must_asciicompat(rb_source);

	/* Fast path: the source is returned untouched, before the encoding check. */
	if (ENC_CODERANGE_ASCIIONLY(rb_source))
		return rb_source;

	if (rb_enc_get(rb_source) != rb_utf8_encoding())
		rb_raise(rb_eEncCompatError, "expected UTF-8 encoding");

	rb_replaced = replace_emoji((uint8_t *)RSTRING_PTR(rb_source), RSTRING_LEN(rb_source), rb_index);

	return NIL_P(rb_replaced) ? rb_source : rb_replaced;
}
/*
 * Extension entry point: defines (or reopens) the Emoji module and registers
 * gsub_unicode on it as a one-argument method.
 */
void Init_gemoji(void)
{
	VALUE rb_mEmoji;

	rb_mEmoji = rb_define_module("Emoji");
	rb_define_method(rb_mEmoji, "gsub_unicode", rb_gemoji_replace_unicode, 1);
}

View File

@@ -1,5 +1,6 @@
require 'emoji/character'
require 'json'
require 'gemoji.so'
module Emoji
extend self

43
lib/emoji/tables.rb Normal file
View File

@@ -0,0 +1,43 @@
require 'emoji'
module Emoji
  # Builds the C source for the lookup tables used by the gemoji extension.
  module Tables
    # Byte sequences (arrays of integers) of every unicode alias of every
    # emoji in Emoji.all. The result is memoized on the module.
    #
    # @return [Array<Array<Integer>>]
    def self.all_byte_sequences
      @all_byte_sequences ||= Emoji.all.flat_map(&:unicode_aliases).compact.map(&:bytes)
    end

    # Generates two C tables:
    #
    # * emoji_byte_lengths - rows of possible sequence lengths, longest first,
    #   each terminated by 0. Row 0 is always the empty row.
    # * emoji_magic_bytes  - 256 entries mapping a leading byte to its row in
    #   emoji_byte_lengths (0 when the byte starts no emoji).
    #
    # Leading bytes that share the same list of lengths share one row. With no
    # emoji at all, only the empty row and an all-zero byte table are emitted.
    #
    # @return [String] C source code
    def self.generate_length_tables
      # Invert "leading byte -> lengths" into "lengths -> leading bytes",
      # keeping first-seen order so the generated rows are stable.
      first_bytes_by_lengths = {}
      all_byte_sequences.group_by(&:first).each do |first_byte, sequences|
        lengths = sequences.map(&:size).uniq.sort.reverse
        (first_bytes_by_lengths[lengths] ||= []) << first_byte
      end

      # +1 leaves room for the terminating 0; `|| 0` covers an empty emoji set.
      row_width = (first_bytes_by_lengths.keys.map(&:size).max || 0) + 1

      magic_bytes = Array.new(256, 0)
      rows = ["\t{0},\n"]
      first_bytes_by_lengths.each_with_index do |(lengths, first_bytes), idx|
        rows << "\t{#{(lengths + [0]).join(', ')}},\n"
        # Row indexes are emitted into an int8_t table, hence 1-based and small.
        first_bytes.each { |byte| magic_bytes[byte] = idx + 1 }
      end

      "static const long emoji_byte_lengths[][#{row_width}] = {\n" \
        "#{rows.join}" \
        "};\n\n" \
        "static const int8_t emoji_magic_bytes[] =\n" \
        "{#{magic_bytes.join(', ')}};\n"
    end
  end
end

15
lib/tasks/chelpers.rake Normal file
View File

@@ -0,0 +1,15 @@
require 'json'
desc "generate helpers for the C extension"
# Prints a 256-entry array with a 1 at the index of every byte that starts
# an emoji listed in db/emoji.json (entries without an 'emoji' key are skipped).
task :c_helpers do
  emoji_file = "#{Rake.original_dir}/db/emoji.json"
  # Read as UTF-8 explicitly so the result does not depend on the locale.
  entries = JSON.parse(File.read(emoji_file, encoding: 'UTF-8'))
  first_bytes = entries.map { |e| e['emoji'] }.compact.map { |e| e.bytes.first }

  # Array.new, not Kernel#Array: Array(256) { 0 } returns [256] and ignores the block.
  magic_bytes = Array.new(256, 0)
  first_bytes.each { |byte| magic_bytes[byte] = 1 }
  puts magic_bytes.inspect
end

42
script/benchmark Normal file
View File

@@ -0,0 +1,42 @@
#!/usr/bin/env ruby
require 'emoji'
require 'benchmark'
# Build a regexp that matches all native emoji characters.
#
# Some emoji code point sequences are prefixes of other emoji code point
# sequences, e.g.:
#
#   U+2728 SPARKLES
# vs.
#   U+2728 SPARKLES U+FE0F VARIATION SELECTOR-16
#
# We sort the code point sequences longest-first so that the regexp will
# match the longest possible sequence. The sequences are combined with
# Regexp.union, which escapes each one, so a sequence containing a regexp
# metacharacter (such as "*") is matched literally instead of being
# interpreted as pattern syntax.
#
# The compiled pattern is memoized in $unicodes_pattern.
def unicodes_pattern
  $unicodes_pattern ||= Regexp.union(emoji_unicodes.sort_by(&:length).reverse)
end
# Every unicode alias of every entry in Emoji.all, as one flat list.
def emoji_unicodes
  Emoji.all.flat_map { |emoji| emoji.unicode_aliases }
end
# Regexp-based baseline: replaces every match of unicodes_pattern in +text+
# with a <g-emoji> tag carrying the name of the emoji found for that match.
def unicode_emoji_filter(text)
  text.gsub(unicodes_pattern) do |match|
    name = Emoji.find_by_unicode(match).name
    "<g-emoji alias='#{name}'>"
  end
end
# Same replacement as unicode_emoji_filter, but driven by Emoji.gsub_unicode,
# which yields the emoji object for each match.
def gsub_unicode(text)
  Emoji.gsub_unicode(text) { |character| "<g-emoji alias='#{character.name}'>" }
end
# Benchmark input: the emoji database (read as UTF-8) repeated 100 times.
db_path = File.join(File.dirname(File.realpath(__FILE__)), '../db/emoji.json')
corpus = File.read(db_path, mode: 'r:UTF-8') * 100

puts "Benchmarking #{corpus.bytesize} bytes..."

# Time the regexp-based filter first, then the C-backed Emoji.gsub_unicode.
regexp_timing = Benchmark.measure('unicode_emoji_filter') { unicode_emoji_filter(corpus) }
puts regexp_timing
native_timing = Benchmark.measure('Emoji#gsub_unicode') { gsub_unicode(corpus) }
puts native_timing

31
test/gsub_test.rb Normal file
View File

@@ -0,0 +1,31 @@
require_relative './test_helper'
# Fuzz test for Emoji.gsub_unicode: interleaves random filler text with every
# known emoji sequence and checks that each one is replaced by its :name:.
class GsubTest < TestCase
  # Pool of filler characters; the space appears ten times so it is sampled
  # more often than any other character.
  def random_letters
    @random_letters ||= [*('a'..'z'), *('A'..'Z'), *('0'..'9'), *Array.new(10, ' '), '%', '-']
  end

  # Up to +n+ characters sampled (without repetition) from the filler pool.
  def random_string(n)
    random_letters.sample(n).join
  end

  # All unicode aliases of all emoji, memoized per test instance.
  def all_emojis
    @all_emojis ||= Emoji.all.flat_map(&:unicode_aliases).compact
  end

  def test_replace_fuzz_testing
    input = ""
    expected = ""

    all_emojis.shuffle.each do |unicode|
      filler = random_string(rand(20))
      input << filler << unicode
      expected << filler << ":#{Emoji.find_by_unicode(unicode).name}:"
    end

    actual = Emoji.gsub_unicode(input) { |character| ":#{character.name}:" }
    assert_equal expected, actual
  end
end