Replace emoji from a LUT

2015-05-18 17:41:13 +02:00
9 changed files with 288 additions and 0 deletions
--- a/14
+++ b/14
@@ -1,4 +1,5 @@
 require 'rake/testtask'
+require 'rake/extensiontask'

 task :default => :test

@@ -36,3 +37,16 @@ namespace :images do
    Emoji::Extractor.new(64, "#{gem_dir}/images/emoji/unicode").extract!
  end
 end
+
+namespace :c do
+  task :headers do
+    require 'emoji/tables'
+    gem_dir = File.dirname(File.realpath(__FILE__))
+
+    File.open(File.join(gem_dir, "ext/gemoji/emoji.h"), "w") do |file|
+      file.puts(Emoji::Tables.generate_length_tables)
+    end
+  end
+end
+# Registers the build tasks for the C extension in ext/gemoji (Rake::ExtensionTask is loaded from 'rake/extensiontask').
+Rake::ExtensionTask.new('gemoji')
--- a/ext/gemoji/emoji.h
+++ b/ext/gemoji/emoji.h
@@ -0,0 +1,11 @@
+static const long emoji_byte_lengths[][5] = { /* generated by `rake c:headers`; each row is a 0-terminated list of candidate emoji byte lengths, longest first */
+	{0}, /* row 0: selected for bytes that start no emoji */
+	{11, 8, 7, 4, 0},
+	{6, 3, 0},
+	{7, 0},
+	{6, 0},
+	{5, 0},
+};
+
+static const int8_t emoji_magic_bytes[] =
+{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
--- a/ext/gemoji/extconf.rb
+++ b/ext/gemoji/extconf.rb
@@ -0,0 +1,6 @@
+require 'mkmf'
+
+$CFLAGS << ' -ggdb3 -O0 ' if ENV['GEMOJI_DEBUG'] # debug builds only: an unconditional -O0 would override the default optimisation level
+
+dir_config('gemoji')
+create_makefile('gemoji')
--- a/ext/gemoji/gemoji.c
+++ b/ext/gemoji/gemoji.c
@@ -0,0 +1,125 @@
+#include <ruby.h>
+#include <ruby/encoding.h>
+#include "emoji.h"
+
+#define unlikely(x) __builtin_expect((x),0)
+/* Tries each candidate byte length in possible_len (0-terminated) as a key into rb_unicode_map; on a hit stores the value in *rb_emoji and returns that length, otherwise returns 0. */
+static long
+lookup_emoji(VALUE *rb_emoji, VALUE rb_bytes, VALUE rb_unicode_map, const long *possible_len)
+{
+	const uint8_t *src = (uint8_t *)RSTRING_PTR(rb_bytes);
+	/* a 0 entry terminates the candidate list */
+	for (; *possible_len; ++possible_len) {
+		const long emoji_size = *possible_len;
+		/* candidate is longer than the bytes currently held in rb_bytes */
+		if (emoji_size > RSTRING_LEN(rb_bytes))
+			continue;
+		/* only look up candidates whose last byte has the 10xxxxxx (UTF-8 continuation) bit pattern */
+		if ((src[emoji_size - 1] & 0xC0) != 0x80)
+			continue;
+		/* shrinks rb_bytes in place; the rows in emoji.h are ordered longest first, so later candidates still fit */
+		rb_str_set_len(rb_bytes, emoji_size);
+
+		*rb_emoji = rb_hash_lookup(rb_unicode_map, rb_bytes);
+		if (!NIL_P(*rb_emoji))
+			return emoji_size;
+	}
+
+	return 0;
+}
+/* Scans size bytes at src, yields every emoji found in rb_unicode_map and builds the output string; returns Qnil when the very first scan reaches the end of src without stopping at a candidate. */
+static VALUE
+replace_emoji(const uint8_t *src, long size, VALUE rb_unicode_map)
+{
+	VALUE rb_emoji, rb_bytes, rb_out = Qnil;
+	long i = 0, org, emoji_len;
+	int8_t emoji_byte;
+
+	while (i < size) {
+		org = i; /* start of the run of bytes that will be copied through unchanged */
+
+retry_search:
+		while (i < size && (emoji_byte = emoji_magic_bytes[(int)src[i]]) == 0)
+			i++;
+		/* a candidate lead byte followed by a byte with the high bit clear is not treated as an emoji start; resume scanning */
+		if (i + 1 < size && (src[i + 1] & 0x80) != 0x80) {
+			i++;
+			goto retry_search;
+		}
+		/* org == 0 only holds on the first pass (i always advances afterwards): allocate the buffers lazily */
+		if (unlikely(org == 0)) {
+			if (i == size)
+				return Qnil;
+
+			rb_out = rb_str_buf_new(size * 4 / 3);
+			rb_enc_associate(rb_out, rb_utf8_encoding());
+			/* scratch buffer reused as the hash lookup key; at most 12 bytes are ever copied into it */
+			rb_bytes = rb_str_buf_new(16);
+			rb_enc_associate(rb_bytes, rb_utf8_encoding());
+		}
+		/* flush the bytes skipped since org */
+		if (i > org)
+			rb_str_buf_cat(rb_out, (const char *)src + org, i - org);
+
+		if (unlikely(i == size))
+			break;
+		/* lookup window: the remaining input, capped at 12 bytes (the longest length listed in emoji.h is 11) */
+		emoji_len = size - i;
+		if (emoji_len > 12)
+			emoji_len = 12;
+
+		memcpy(RSTRING_PTR(rb_bytes), src + i, emoji_len);
+		rb_str_set_len(rb_bytes, emoji_len);
+
+		emoji_len = lookup_emoji(&rb_emoji, rb_bytes, rb_unicode_map, emoji_byte_lengths[(int)emoji_byte]);
+		/* non-zero: an emoji of emoji_len bytes starts at src + i */
+		if (emoji_len) {
+			VALUE rb_repl = rb_yield(rb_emoji);
+			/* a nil block result keeps the original bytes; anything else must be a String */
+			if (NIL_P(rb_repl)) {
+				rb_str_buf_cat(rb_out, (const char *)src + i, emoji_len);
+			} else {
+				Check_Type(rb_repl, T_STRING);
+				rb_str_buf_append(rb_out, rb_repl);
+			}
+			/* NOTE(review): src is still read after rb_yield returned -- confirm the block cannot modify the source string */
+			i += emoji_len;
+			continue;
+		}
+		/* no emoji here: copy the candidate byte through and continue behind it */
+		rb_str_buf_cat(rb_out, (const char *)src + i, 1);
+		i++;
+	}
+
+	return rb_out;
+}
+/* Implements gsub_unicode(source) { |emoji| ... }: returns rb_source itself when it is flagged ASCII-only or replace_emoji returns nil, otherwise the string built by replace_emoji. */
+static VALUE
+rb_gemoji_replace_unicode(VALUE klass, VALUE rb_source)
+{
+	VALUE rb_output;
+	VALUE rb_unicode_map = rb_funcall(klass, rb_intern("unicodes_index"), 0);
+	/* klass is the receiver of gsub_unicode; its unicodes_index must return the Hash used for the lookups */
+	Check_Type(rb_source, T_STRING);
+	Check_Type(rb_unicode_map, T_HASH);
+
+	rb_must_asciicompat(rb_source);
+	/* presumably safe because every emoji sequence contains a non-ASCII byte -- TODO confirm */
+	if (ENC_CODERANGE_ASCIIONLY(rb_source))
+		return rb_source;
+	/* replace_emoji tags its lookup keys and its output as UTF-8, so other encodings are rejected */
+	if (rb_enc_get(rb_source) != rb_utf8_encoding())
+		rb_raise(rb_eEncCompatError, "expected UTF-8 encoding");
+	/* NOTE(review): a raw pointer into rb_source is handed over and used across block calls -- confirm rb_source cannot be modified meanwhile */
+	rb_output = replace_emoji((uint8_t *)RSTRING_PTR(rb_source), RSTRING_LEN(rb_source), rb_unicode_map);
+	if (NIL_P(rb_output))
+		return rb_source;
+
+	return rb_output;
+}
+/* Extension entry point: defines gsub_unicode as a one-argument method on the Emoji module. */
+void Init_gemoji(void)
+{
+	VALUE rb_mEmoji = rb_define_module("Emoji");
+	rb_define_method(rb_mEmoji, "gsub_unicode", rb_gemoji_replace_unicode, 1); /* 1 = arity: the source string */
+}
--- a/lib/emoji.rb
+++ b/lib/emoji.rb
@@ -1,5 +1,6 @@
 require 'emoji/character'
 require 'json'
+require 'gemoji.so'

 module Emoji
  extend self
--- a/lib/emoji/tables.rb
+++ b/lib/emoji/tables.rb
@@ -0,0 +1,43 @@
+require 'emoji'
+
+module Emoji
+  module Tables
+    def self.all_byte_sequences
+      @all_byte_sequences ||= begin
+        all = Emoji.all.flat_map { |e| e.unicode_aliases }.compact
+        all.map { |e| e.bytes }
+      end
+    end
+
+    def self.generate_length_tables
+      groups = all_byte_sequences.group_by { |seq| seq.first }
+
+      groups.each do |k, v|
+        v.map! { |seq| seq.size }
+        v.uniq!
+        v.sort!
+        v.reverse!
+      end
+
+      groups = groups.reduce({}) { |h, (k,v)| (h[v] ||= []) << k; h}
+      byte_array = Array.new(256) { 0 }
+      tags_width = groups.keys.map { |k| k.size }.max + 1
+
+      code = "static const long emoji_byte_lengths[][#{tags_width}] = {\n"
+      code << "\t{0},\n"
+
+      groups.each_with_index do |(len_tags, magic_bytes), idx|
+        code << "\t{" + (len_tags + [0]).join(', ') + "},\n" 
+
+        magic_bytes.each do |b|
+          byte_array[b] = idx + 1
+        end
+      end
+
+      code << "};\n\n"
+      code << "static const int8_t emoji_magic_bytes[] =\n"
+      code << "{" + byte_array.map(&:to_s).join(', ') + "};\n"
+      code
+    end
+  end
+end
--- a/lib/tasks/chelpers.rake
+++ b/lib/tasks/chelpers.rake
@@ -0,0 +1,15 @@
+require 'json'
+
+desc "generate helpers for the C extension"
+task :c_helpers do
+  emoji_file = "#{Rake.original_dir}/db/emoji.json"
+  emoji = JSON.parse(File.read(emoji_file))
+
+  all_emojis = emoji.map { |e| e['emoji'] }.compact
+  all_emojis.map! { |e| e.bytes }
+
+  magic_bytes = Array(256) { 0 }
+  all_emojis.each { |bytes| magic_bytes[bytes.first] = 1 }
+
+  puts magic_bytes.inspect
+end
--- a/script/benchmark
+++ b/script/benchmark
@@ -0,0 +1,42 @@
+#!/usr/bin/env ruby
+
+require 'emoji'
+require 'benchmark'
+
+# Build a regexp that matches all native emoji characters.
+# Some emoji code point sequences are prefixes of other emoji code point
+# sequences, e.g.:
+# U+2728 SPARKLES
+# vs.
+# U+2728 SPARKLES U+FE0F VARIATION SELECTOR-16
+# We sort the code point sequences longest-first so that the regex will
+# match the longest possible sequence.
+def unicodes_pattern
+  $unicodes_pattern ||= Regexp.new(emoji_unicodes.sort_by(&:length).reverse.join("|"))
+end
+
+def emoji_unicodes
+  Emoji.all.flat_map(&:unicode_aliases)
+end
+
+def unicode_emoji_filter(text)
+  text.gsub(unicodes_pattern) do |unicode|
+    emoji = Emoji.find_by_unicode(unicode)
+    "<g-emoji alias='#{emoji.name}'>" 
+  end
+end
+
+def gsub_unicode(text)
+  Emoji.gsub_unicode(text) do |emoji|
+    "<g-emoji alias='#{emoji.name}'>" 
+  end
+end
+
+data_file = File.join(File.dirname(File.realpath(__FILE__)), '../db/emoji.json')
+raw = File.open(data_file, 'r:UTF-8') { |file| file.read }
+raw = raw * 100
+
+puts "Benchmarking #{raw.bytesize} bytes..."
+
+puts Benchmark.measure('unicode_emoji_filter') { unicode_emoji_filter(raw) }
+puts Benchmark.measure('Emoji#gsub_unicode') { gsub_unicode(raw) }
--- a/test/gsub_test.rb
+++ b/test/gsub_test.rb
@@ -0,0 +1,31 @@
+require_relative './test_helper'
+
+class GsubTest < TestCase
+
+  def random_letters
+    @random_letters ||= ('a'..'z').to_a + ('A'..'Z').to_a + ('0'..'9').to_a + [' '] * 10 + ['%', '-']
+  end
+
+  def random_string(n)
+    random_letters.sample(n).join
+  end
+
+  def all_emojis
+    @all_emojis ||= Emoji.all.flat_map { |e| e.unicode_aliases }.compact
+  end
+
+  def test_replace_fuzz_testing
+    emoji = all_emojis.shuffle
+    fuzz = ""
+    expected = ""
+
+    emoji.each do |emoji|
+      rnd = random_string(rand(20))
+      fuzz << rnd << emoji
+      expected << rnd << ":#{Emoji.find_by_unicode(emoji).name}:"
+    end
+
+    result = Emoji.gsub_unicode(fuzz) { |emoji| ":#{emoji.name}:" }
+    assert_equal expected, result
+  end
+end