Compare commits
1 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
1e85635643 |
14
Rakefile
14
Rakefile
@@ -1,4 +1,5 @@
|
||||
require 'rake/testtask'
|
||||
require 'rake/extensiontask'
|
||||
|
||||
task :default => :test
|
||||
|
||||
@@ -36,3 +37,16 @@ namespace :images do
|
||||
Emoji::Extractor.new(64, "#{gem_dir}/images/emoji/unicode").extract!
|
||||
end
|
||||
end
|
||||
|
||||
namespace :c do
  # Writes the C lookup tables produced by Emoji::Tables.generate_length_tables
  # to ext/gemoji/emoji.h, overwriting any existing header.
  task :headers do
    require 'emoji/tables'

    project_root = File.dirname(File.realpath(__FILE__))
    header_path = File.join(project_root, "ext/gemoji/emoji.h")

    File.open(header_path, "w") { |header| header.puts(Emoji::Tables.generate_length_tables) }
  end
end
|
||||
|
||||
Rake::ExtensionTask.new('gemoji')
|
||||
|
||||
11
ext/gemoji/emoji.h
Normal file
11
ext/gemoji/emoji.h
Normal file
@@ -0,0 +1,11 @@
|
||||
static const long emoji_byte_lengths[][5] = {
|
||||
{0},
|
||||
{11, 8, 7, 4, 0},
|
||||
{6, 3, 0},
|
||||
{7, 0},
|
||||
{6, 0},
|
||||
{5, 0},
|
||||
};
|
||||
|
||||
static const int8_t emoji_magic_bytes[] =
|
||||
{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
|
||||
6
ext/gemoji/extconf.rb
Normal file
6
ext/gemoji/extconf.rb
Normal file
@@ -0,0 +1,6 @@
|
||||
require 'mkmf'

# Compile the extension with full debug information and optimisation disabled.
# NOTE(review): these flags are applied unconditionally; confirm whether -O0 is
# intended for anything other than local debugging builds.
$CFLAGS.concat(' -ggdb3 -O0 ')

# Emit the Makefile for the `gemoji` extension.
dir_config('gemoji')
create_makefile('gemoji')
|
||||
125
ext/gemoji/gemoji.c
Normal file
125
ext/gemoji/gemoji.c
Normal file
@@ -0,0 +1,125 @@
|
||||
#include <ruby.h>
|
||||
#include <ruby/encoding.h>
|
||||
#include "emoji.h"
|
||||
|
||||
#define unlikely(x) __builtin_expect((x),0)
|
||||
|
||||
static long
|
||||
lookup_emoji(VALUE *rb_emoji, VALUE rb_bytes, VALUE rb_unicode_map, const long *possible_len)
|
||||
{
|
||||
const uint8_t *src = (uint8_t *)RSTRING_PTR(rb_bytes);
|
||||
|
||||
for (; *possible_len; ++possible_len) {
|
||||
const long emoji_size = *possible_len;
|
||||
|
||||
if (emoji_size > RSTRING_LEN(rb_bytes))
|
||||
continue;
|
||||
|
||||
if ((src[emoji_size - 1] & 0xC0) != 0x80)
|
||||
continue;
|
||||
|
||||
rb_str_set_len(rb_bytes, emoji_size);
|
||||
|
||||
*rb_emoji = rb_hash_lookup(rb_unicode_map, rb_bytes);
|
||||
if (!NIL_P(*rb_emoji))
|
||||
return emoji_size;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static VALUE
|
||||
replace_emoji(const uint8_t *src, long size, VALUE rb_unicode_map)
|
||||
{
|
||||
VALUE rb_emoji, rb_bytes, rb_out = Qnil;
|
||||
long i = 0, org, emoji_len;
|
||||
int8_t emoji_byte;
|
||||
|
||||
while (i < size) {
|
||||
org = i;
|
||||
|
||||
retry_search:
|
||||
while (i < size && (emoji_byte = emoji_magic_bytes[(int)src[i]]) == 0)
|
||||
i++;
|
||||
|
||||
if (i + 1 < size && (src[i + 1] & 0x80) != 0x80) {
|
||||
i++;
|
||||
goto retry_search;
|
||||
}
|
||||
|
||||
if (unlikely(org == 0)) {
|
||||
if (i == size)
|
||||
return Qnil;
|
||||
|
||||
rb_out = rb_str_buf_new(size * 4 / 3);
|
||||
rb_enc_associate(rb_out, rb_utf8_encoding());
|
||||
|
||||
rb_bytes = rb_str_buf_new(16);
|
||||
rb_enc_associate(rb_bytes, rb_utf8_encoding());
|
||||
}
|
||||
|
||||
if (i > org)
|
||||
rb_str_buf_cat(rb_out, (const char *)src + org, i - org);
|
||||
|
||||
if (unlikely(i == size))
|
||||
break;
|
||||
|
||||
emoji_len = size - i;
|
||||
if (emoji_len > 12)
|
||||
emoji_len = 12;
|
||||
|
||||
memcpy(RSTRING_PTR(rb_bytes), src + i, emoji_len);
|
||||
rb_str_set_len(rb_bytes, emoji_len);
|
||||
|
||||
emoji_len = lookup_emoji(&rb_emoji, rb_bytes, rb_unicode_map, emoji_byte_lengths[(int)emoji_byte]);
|
||||
|
||||
if (emoji_len) {
|
||||
VALUE rb_repl = rb_yield(rb_emoji);
|
||||
|
||||
if (NIL_P(rb_repl)) {
|
||||
rb_str_buf_cat(rb_out, (const char *)src + i, emoji_len);
|
||||
} else {
|
||||
Check_Type(rb_repl, T_STRING);
|
||||
rb_str_buf_append(rb_out, rb_repl);
|
||||
}
|
||||
|
||||
i += emoji_len;
|
||||
continue;
|
||||
}
|
||||
|
||||
rb_str_buf_cat(rb_out, (const char *)src + i, 1);
|
||||
i++;
|
||||
}
|
||||
|
||||
return rb_out;
|
||||
}
|
||||
|
||||
static VALUE
|
||||
rb_gemoji_replace_unicode(VALUE klass, VALUE rb_source)
|
||||
{
|
||||
VALUE rb_output;
|
||||
VALUE rb_unicode_map = rb_funcall(klass, rb_intern("unicodes_index"), 0);
|
||||
|
||||
Check_Type(rb_source, T_STRING);
|
||||
Check_Type(rb_unicode_map, T_HASH);
|
||||
|
||||
rb_must_asciicompat(rb_source);
|
||||
|
||||
if (ENC_CODERANGE_ASCIIONLY(rb_source))
|
||||
return rb_source;
|
||||
|
||||
if (rb_enc_get(rb_source) != rb_utf8_encoding())
|
||||
rb_raise(rb_eEncCompatError, "expected UTF-8 encoding");
|
||||
|
||||
rb_output = replace_emoji((uint8_t *)RSTRING_PTR(rb_source), RSTRING_LEN(rb_source), rb_unicode_map);
|
||||
if (NIL_P(rb_output))
|
||||
return rb_source;
|
||||
|
||||
return rb_output;
|
||||
}
|
||||
|
||||
void Init_gemoji(void)
|
||||
{
|
||||
VALUE rb_mEmoji = rb_define_module("Emoji");
|
||||
rb_define_method(rb_mEmoji, "gsub_unicode", rb_gemoji_replace_unicode, 1);
|
||||
}
|
||||
@@ -1,5 +1,6 @@
|
||||
require 'emoji/character'
|
||||
require 'json'
|
||||
require 'gemoji.so'
|
||||
|
||||
module Emoji
|
||||
extend self
|
||||
|
||||
43
lib/emoji/tables.rb
Normal file
43
lib/emoji/tables.rb
Normal file
@@ -0,0 +1,43 @@
|
||||
require 'emoji'
|
||||
|
||||
module Emoji
  # Generates the C lookup tables used by the native extension.
  module Tables
    # Byte arrays for every unicode alias of every emoji (memoized).
    def self.all_byte_sequences
      @all_byte_sequences ||=
        Emoji.all.flat_map { |e| e.unicode_aliases }.compact.map { |sequence| sequence.bytes }
    end

    # Returns C source declaring two tables:
    #
    # * emoji_byte_lengths - one zero-terminated row per distinct set of
    #   sequence lengths (longest first); row 0 is the empty set.
    # * emoji_magic_bytes  - 256 entries mapping a leading byte to its row in
    #   emoji_byte_lengths, or 0 when no sequence starts with that byte.
    def self.generate_length_tables
      # Leading byte => distinct sequence lengths, longest first.
      lengths_by_lead = all_byte_sequences.group_by { |seq| seq.first }.map do |lead, seqs|
        [lead, seqs.map { |seq| seq.size }.uniq.sort.reverse]
      end

      # Length list => every leading byte that shares it.
      leads_by_lengths = lengths_by_lead.each_with_object({}) do |(lead, lengths), acc|
        (acc[lengths] ||= []) << lead
      end

      row_width = leads_by_lengths.keys.map { |lengths| lengths.size }.max + 1
      magic = Array.new(256) { 0 }
      rows = ["\t{0},\n"]

      leads_by_lengths.each_with_index do |(lengths, leads), index|
        rows << "\t{" + (lengths + [0]).join(', ') + "},\n"
        leads.each { |lead| magic[lead] = index + 1 }
      end

      "static const long emoji_byte_lengths[][#{row_width}] = {\n" +
        rows.join +
        "};\n\n" +
        "static const int8_t emoji_magic_bytes[] =\n" +
        "{" + magic.join(', ') + "};\n"
    end
  end
end
|
||||
15
lib/tasks/chelpers.rake
Normal file
15
lib/tasks/chelpers.rake
Normal file
@@ -0,0 +1,15 @@
|
||||
require 'json'
|
||||
|
||||
desc "generate helpers for the C extension"
# Prints a 256-entry array in which index b is 1 when some emoji in
# db/emoji.json starts with byte b, and 0 otherwise.
task :c_helpers do
  emoji_file = "#{Rake.original_dir}/db/emoji.json"
  entries = JSON.parse(File.read(emoji_file, encoding: 'UTF-8'))

  # Entries without an 'emoji' key are skipped.
  lead_bytes = entries.map { |entry| entry['emoji'] }.compact.map { |sequence| sequence.bytes.first }

  # Array.new, not Kernel#Array: Array(256) would yield [256] and ignore the block.
  magic_bytes = Array.new(256, 0)
  lead_bytes.each { |byte| magic_bytes[byte] = 1 }

  puts magic_bytes.inspect
end
|
||||
42
script/benchmark
Normal file
42
script/benchmark
Normal file
@@ -0,0 +1,42 @@
|
||||
#!/usr/bin/env ruby
|
||||
|
||||
require 'emoji'
|
||||
require 'benchmark'
|
||||
|
||||
# Build a regexp that matches all native emoji characters.
# Some emoji code point sequences are prefixes of other emoji code point
# sequences, e.g.:
#   U+2728 SPARKLES
# vs.
#   U+2728 SPARKLES U+FE0F VARIATION SELECTOR-16
# We sort the code point sequences longest-first so that the regex will
# match the longest possible sequence.
#
# Regexp.union escapes each sequence, so a sequence that contains a regexp
# metacharacter is matched literally instead of corrupting the pattern.
# The compiled pattern is memoized in $unicodes_pattern.
def unicodes_pattern
  $unicodes_pattern ||= Regexp.union(emoji_unicodes.sort_by(&:length).reverse)
end
|
||||
|
||||
# Flat list of every unicode alias of every emoji in Emoji.all.
def emoji_unicodes
  Emoji.all.flat_map { |character| character.unicode_aliases }
end
|
||||
|
||||
# Pure-Ruby baseline: replaces each match of unicodes_pattern in +text+ with a
# <g-emoji> tag carrying the name returned by Emoji.find_by_unicode.
def unicode_emoji_filter(text)
  text.gsub(unicodes_pattern) { |match| "<g-emoji alias='#{Emoji.find_by_unicode(match).name}'>" }
end
|
||||
|
||||
# Same replacement as unicode_emoji_filter, but delegating the scan to
# Emoji.gsub_unicode.
def gsub_unicode(text)
  Emoji.gsub_unicode(text) { |character| "<g-emoji alias='#{character.name}'>" }
end
|
||||
|
||||
# Benchmark input: db/emoji.json read as UTF-8 and repeated 100 times.
data_file = File.join(File.dirname(File.realpath(__FILE__)), '../db/emoji.json')
raw = File.read(data_file, mode: 'r:UTF-8') * 100

puts "Benchmarking #{raw.bytesize} bytes..."

# Time the regexp-based filter first, then the Emoji.gsub_unicode wrapper.
{
  'unicode_emoji_filter' => method(:unicode_emoji_filter),
  'Emoji#gsub_unicode' => method(:gsub_unicode)
}.each do |label, implementation|
  puts Benchmark.measure(label) { implementation.call(raw) }
end
|
||||
31
test/gsub_test.rb
Normal file
31
test/gsub_test.rb
Normal file
@@ -0,0 +1,31 @@
|
||||
require_relative './test_helper'
|
||||
|
||||
# Fuzz test: Emoji.gsub_unicode must replace every known emoji sequence and
# leave the random filler text between them untouched.
class GsubTest < TestCase

  # Pool of filler characters: ASCII letters, digits, ten spaces, '%' and '-'.
  def random_letters
    @random_letters ||= ('a'..'z').to_a + ('A'..'Z').to_a + ('0'..'9').to_a + [' '] * 10 + ['%', '-']
  end

  # Up to +n+ distinct entries sampled from the pool, joined into a string.
  def random_string(n)
    random_letters.sample(n).join
  end

  # Every unicode alias of every emoji.
  def all_emojis
    @all_emojis ||= Emoji.all.flat_map { |e| e.unicode_aliases }.compact
  end

  def test_replace_fuzz_testing
    fuzz = ""
    expected = ""

    # Block parameters are named distinctly so they do not shadow outer locals.
    all_emojis.shuffle.each do |unicode|
      filler = random_string(rand(20))
      fuzz << filler << unicode
      expected << filler << ":#{Emoji.find_by_unicode(unicode).name}:"
    end

    result = Emoji.gsub_unicode(fuzz) { |character| ":#{character.name}:" }
    assert_equal expected, result
  end
end
|
||||
Reference in New Issue
Block a user