/ Published in: Ruby
Convert Vietnamese characters into ASCII so they can be indexed and searched.
Expand |
Embed | Plain Text
require 'unicode'

# Token filter that normalizes token text to lower case.
#
# Wraps another token stream and lower-cases each token's text with
# Unicode.downcase (from the `unicode` gem) before handing it on.
class UnicodeLowerCaseFilter
  # token_stream - the upstream token stream this filter reads from.
  def initialize(token_stream)
    @input = token_stream
  end

  # Forwards new text to the wrapped token stream.
  def text=(text)
    @input.text = text
  end

  # Returns the next token from the wrapped stream with its text
  # lower-cased, or nil when the wrapped stream returns nil.
  def next
    token = @input.next
    return nil if token.nil?

    token.text = Unicode.downcase(token.text)
    token
  end
end

# Analyzer that converts Vietnamese characters into ASCII so that text can
# be indexed and searched without diacritics.
class VietnameseAnalyzer < Ferret::Analysis::Analyzer
  include Ferret::Analysis

  # Standard character mappings to remove all special characters
  # so only default ASCII characters get indexed.
  #
  # Only lower-case letters are listed: token_stream lower-cases tokens
  # before this mapping is applied.
  CHARACTER_MAPPINGS = {
    ['á', 'à', 'ạ', 'ả', 'ã',
     'ă', 'ắ', 'ằ', 'ặ', 'ẳ', 'ẵ',
     'â', 'ấ', 'ầ', 'ậ', 'ẩ', 'ẫ'] => 'a',
    ['đ'] => 'd',
    ['é', 'è', 'ẹ', 'ẻ', 'ẽ',
     'ê', 'ế', 'ề', 'ệ', 'ể', 'ễ'] => 'e',
    ['í', 'ì', 'ị', 'ỉ', 'ĩ'] => 'i',
    # 'ỏ' (o with hook above) belongs here; the list previously contained
    # 'ủ' by mistake, which folded that u-letter to 'o' and left 'ỏ' unmapped.
    ['ó', 'ò', 'ọ', 'ỏ', 'õ',
     'ơ', 'ớ', 'ờ', 'ợ', 'ở', 'ỡ',
     'ô', 'ố', 'ồ', 'ộ', 'ổ', 'ỗ'] => 'o',
    # 'ủ' (u with hook above) is the Vietnamese letter; 'ů' (u with ring) is
    # not Vietnamese but is kept so existing behavior for it does not change.
    ['ú', 'ù', 'ụ', 'ủ', 'ů', 'ũ',
     'ư', 'ứ', 'ừ', 'ự', 'ử', 'ữ'] => 'u',
    ['ý', 'ỳ', 'ỵ', 'ỷ', 'ỹ'] => 'y',
  } unless defined?(CHARACTER_MAPPINGS)

  # Builds the token stream for a field: standard tokenization, then
  # lower-casing, then the diacritic-to-ASCII character mapping.
  #
  # field - the field name (not used by this analyzer).
  # str   - the text to tokenize.
  #
  # Returns the final MappingFilter in the chain.
  def token_stream(field, str)
    tokens = StandardTokenizer.new(str)
    lowered = UnicodeLowerCaseFilter.new(tokens)
    MappingFilter.new(lowered, CHARACTER_MAPPINGS)
  end
end
You need to login to post a comment.
