diff --git a/Rakefile b/Rakefile index 2cea5c9..efffcc1 100644 --- a/Rakefile +++ b/Rakefile @@ -1,4 +1,5 @@ require 'rake/testtask' +require 'rake/extensiontask' task :default => :test @@ -36,3 +37,16 @@ namespace :images do Emoji::Extractor.new(64, "#{gem_dir}/images/emoji/unicode").extract! end end + +namespace :c do + task :headers do + require 'emoji/tables' + gem_dir = File.dirname(File.realpath(__FILE__)) + + File.open(File.join(gem_dir, "ext/gemoji/emoji.h"), "w") do |file| + file.puts(Emoji::Tables.generate_length_tables) + end + end +end + +Rake::ExtensionTask.new('gemoji') diff --git a/ext/gemoji/emoji.h b/ext/gemoji/emoji.h new file mode 100644 index 0000000..de23f1b --- /dev/null +++ b/ext/gemoji/emoji.h @@ -0,0 +1,11 @@ +static const long emoji_byte_lengths[][5] = { + {0}, + {11, 8, 7, 4, 0}, + {6, 3, 0}, + {7, 0}, + {6, 0}, + {5, 0}, +}; + +static const int8_t emoji_magic_bytes[] = +{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; diff --git a/ext/gemoji/extconf.rb b/ext/gemoji/extconf.rb new file mode 100644 index 0000000..0287b01 --- /dev/null +++ b/ext/gemoji/extconf.rb @@ -0,0 +1,6 @@ +require 'mkmf' + +$CFLAGS << ' -ggdb3 -O0 ' + +dir_config('gemoji') +create_makefile('gemoji') diff --git a/ext/gemoji/gemoji.c b/ext/gemoji/gemoji.c new file mode 100644 index 0000000..9e2ff84 --- /dev/null +++ b/ext/gemoji/gemoji.c @@ -0,0 +1,125 @@ +#include +#include +#include "emoji.h" + +#define unlikely(x) __builtin_expect((x),0) + +static long +lookup_emoji(VALUE *rb_emoji, VALUE rb_bytes, VALUE rb_unicode_map, const long *possible_len) +{ + const uint8_t *src = (uint8_t *)RSTRING_PTR(rb_bytes); + + for (; *possible_len; ++possible_len) { + const long emoji_size = *possible_len; + + if (emoji_size > RSTRING_LEN(rb_bytes)) + continue; + + if ((src[emoji_size - 1] & 0xC0) != 0x80) + continue; + + rb_str_set_len(rb_bytes, emoji_size); + + *rb_emoji = rb_hash_lookup(rb_unicode_map, rb_bytes); + if (!NIL_P(*rb_emoji)) + return emoji_size; + } + + return 0; +} + +static VALUE +replace_emoji(const uint8_t *src, long size, VALUE rb_unicode_map) +{ + VALUE rb_emoji, rb_bytes, rb_out = Qnil; + long i = 0, org, emoji_len; + int8_t emoji_byte; + + while (i < size) { + org = i; + +retry_search: + while (i < size && (emoji_byte = emoji_magic_bytes[(int)src[i]]) == 0) + i++; + + if (i + 1 < size && (src[i + 1] & 0x80) != 0x80) { + i++; + goto retry_search; + } + + if (unlikely(org == 0)) { + if (i == size) + return Qnil; + + rb_out = rb_str_buf_new(size * 4 / 3); + rb_enc_associate(rb_out, rb_utf8_encoding()); + + rb_bytes = rb_str_buf_new(16); + rb_enc_associate(rb_bytes, rb_utf8_encoding()); + } + + if (i > org) + rb_str_buf_cat(rb_out, (const char *)src + org, i - org); + + if (unlikely(i == size)) + break; + + emoji_len = size - i; + if (emoji_len > 12) + emoji_len = 12; + + memcpy(RSTRING_PTR(rb_bytes), src + i, emoji_len); + rb_str_set_len(rb_bytes, emoji_len); + + emoji_len = lookup_emoji(&rb_emoji, rb_bytes, rb_unicode_map, emoji_byte_lengths[(int)emoji_byte]); + + if (emoji_len) { + VALUE rb_repl = rb_yield(rb_emoji); + + if (NIL_P(rb_repl)) { + rb_str_buf_cat(rb_out, (const char *)src + i, emoji_len); + } else { + Check_Type(rb_repl, T_STRING); + rb_str_buf_append(rb_out, rb_repl); + } + + i += emoji_len; + continue; + } + + rb_str_buf_cat(rb_out, (const char *)src + i, 1); + i++; + } + + return rb_out; +} + +static VALUE +rb_gemoji_replace_unicode(VALUE klass, VALUE rb_source) +{ + VALUE rb_output; + VALUE rb_unicode_map = rb_funcall(klass, rb_intern("unicodes_index"), 0); + + Check_Type(rb_source, T_STRING); + Check_Type(rb_unicode_map, T_HASH); + + rb_must_asciicompat(rb_source); + + if (ENC_CODERANGE_ASCIIONLY(rb_source)) + return rb_source; + + if (rb_enc_get(rb_source) != rb_utf8_encoding()) + rb_raise(rb_eEncCompatError, "expected UTF-8 encoding"); + + rb_output = replace_emoji((uint8_t *)RSTRING_PTR(rb_source), RSTRING_LEN(rb_source), rb_unicode_map); + if (NIL_P(rb_output)) + return rb_source; + + return rb_output; +} + +void Init_gemoji(void) +{ + VALUE rb_mEmoji = rb_define_module("Emoji"); + rb_define_method(rb_mEmoji, "gsub_unicode", rb_gemoji_replace_unicode, 1); +} diff --git a/lib/emoji.rb b/lib/emoji.rb index c03e47e..40d5e50 100644 --- a/lib/emoji.rb +++ b/lib/emoji.rb @@ -1,5 +1,6 @@ require 'emoji/character' require 'json' +require 'gemoji.so' module Emoji extend self diff --git a/lib/emoji/tables.rb b/lib/emoji/tables.rb new file mode 100644 index 0000000..88ccba5 --- /dev/null +++ b/lib/emoji/tables.rb @@ -0,0 +1,43 @@ +require 'emoji' + +module Emoji + module Tables + def self.all_byte_sequences + @all_byte_sequences ||= begin + all = Emoji.all.flat_map { |e| e.unicode_aliases }.compact + all.map { |e| e.bytes } + end + end + + def self.generate_length_tables + groups = all_byte_sequences.group_by { |seq| seq.first } + + groups.each do |k, v| + v.map! { |seq| seq.size } + v.uniq! + v.sort! + v.reverse! + end + + groups = groups.reduce({}) { |h, (k,v)| (h[v] ||= []) << k; h} + byte_array = Array.new(256) { 0 } + tags_width = groups.keys.map { |k| k.size }.max + 1 + + code = "static const long emoji_byte_lengths[][#{tags_width}] = {\n" + code << "\t{0},\n" + + groups.each_with_index do |(len_tags, magic_bytes), idx| + code << "\t{" + (len_tags + [0]).join(', ') + "},\n" + + magic_bytes.each do |b| + byte_array[b] = idx + 1 + end + end + + code << "};\n\n" + code << "static const int8_t emoji_magic_bytes[] =\n" + code << "{" + byte_array.map(&:to_s).join(', ') + "};\n" + code + end + end +end diff --git a/lib/tasks/chelpers.rake b/lib/tasks/chelpers.rake new file mode 100644 index 0000000..5819a5f --- /dev/null +++ b/lib/tasks/chelpers.rake @@ -0,0 +1,15 @@ +require 'json' + +desc "generate helpers for the C extension" +task :c_helpers do + emoji_file = "#{Rake.original_dir}/db/emoji.json" + emoji = JSON.parse(File.read(emoji_file)) + + all_emojis = emoji.map { |e| e['emoji'] }.compact + all_emojis.map! { |e| e.bytes } + + magic_bytes = Array(256) { 0 } + all_emojis.each { |bytes| magic_bytes[bytes.first] = 1 } + + puts magic_bytes.inspect +end diff --git a/script/benchmark b/script/benchmark new file mode 100644 index 0000000..629d402 --- /dev/null +++ b/script/benchmark @@ -0,0 +1,42 @@ +#!/usr/bin/env ruby + +require 'emoji' +require 'benchmark' + +# Build a regexp that matches all native emoji characters. +# Some emoji code point sequences are prefixes of other emoji code point +# sequences, e.g.: +# U+2728 SPARKLES +# vs. +# U+2728 SPARKLES U+FE0F VARIATION SELECTOR-16 +# We sort the code point sequences longest-first so that the regex will +# match the longest possible sequence. +def unicodes_pattern + $unicodes_pattern ||= Regexp.new(emoji_unicodes.sort_by(&:length).reverse.join("|")) +end + +def emoji_unicodes + Emoji.all.flat_map(&:unicode_aliases) +end + +def unicode_emoji_filter(text) + text.gsub(unicodes_pattern) do |unicode| + emoji = Emoji.find_by_unicode(unicode) + "" + end +end + +def gsub_unicode(text) + Emoji.gsub_unicode(text) do |emoji| + "" + end +end + +data_file = File.join(File.dirname(File.realpath(__FILE__)), '../db/emoji.json') +raw = File.open(data_file, 'r:UTF-8') { |file| file.read } +raw = raw * 100 + +puts "Benchmarking #{raw.bytesize} bytes..." + +puts Benchmark.measure('unicode_emoji_filter') { unicode_emoji_filter(raw) } +puts Benchmark.measure('Emoji#gsub_unicode') { gsub_unicode(raw) } diff --git a/test/gsub_test.rb b/test/gsub_test.rb new file mode 100644 index 0000000..fed9c25 --- /dev/null +++ b/test/gsub_test.rb @@ -0,0 +1,31 @@ +require_relative './test_helper' + +class GsubTest < TestCase + + def random_letters + @random_letters ||= ('a'..'z').to_a + ('A'..'Z').to_a + ('0'..'9').to_a + [' '] * 10 + ['%', '-'] + end + + def random_string(n) + random_letters.sample(n).join + end + + def all_emojis + @all_emojis ||= Emoji.all.flat_map { |e| e.unicode_aliases }.compact + end + + def test_replace_fuzz_testing + emoji = all_emojis.shuffle + fuzz = "" + expected = "" + + emoji.each do |emoji| + rnd = random_string(rand(20)) + fuzz << rnd << emoji + expected << rnd << ":#{Emoji.find_by_unicode(emoji).name}:" + end + + result = Emoji.gsub_unicode(fuzz) { |emoji| ":#{emoji.name}:" } + assert_equal expected, result + end +end