module UTF8Util

Constants

HIGH_BIT_RANGE
REPLACEMENT_CHAR

Use ‘?’ instead of the Unicode replacement character, since that character is 3 bytes long and can increase the string size if many replacements are made.

Public Class Methods

clean(str) click to toggle source

Replace invalid UTF-8 character sequences with a replacement character

Returns a copy of this String as valid UTF-8.

# File lib/resque/vendor/utf8_util.rb, line 16
# Non-destructive counterpart of clean!.
#
# str - the String to clean; it is duplicated first, so the caller's object
#       is not the one handed to clean!.
#
# Returns whatever clean! returns for the duplicate.
def self.clean(str)
  duplicate = str.dup
  clean!(duplicate)
end
clean!(str) click to toggle source

Replace invalid UTF-8 character sequences with a replacement character

Returns self as valid UTF-8.

# File lib/resque/vendor/utf8_util.rb, line 9
# Placeholder for the in-place cleaner.
#
# str - the String that would be cleaned (unused here).
#
# Always raises NotImplementedError.
# NOTE(review): presumably a version-specific file supplies the real
# implementation - confirm against the files that reopen this module.
def self.clean!(str)
  fail NotImplementedError
end
sequence_length(scanner) click to toggle source

Validate the UTF-8 sequence at the current scanner position.

scanner - StringScanner instance so we can advance the pointer as we verify.

Returns the length in bytes of this UTF-8 sequence, or false if it is invalid.

# File lib/resque/vendor/utf8_util/utf8_util_18.rb, line 44
# Validate the UTF-8 sequence at the current scanner position.
#
# scanner - StringScanner positioned on the leading byte of the sequence.
#           The leading byte is always consumed.
#
# The high bits of the leading byte select how many trailing bytes are
# expected (110xxxxx -> 1, 1110xxxx -> 2, 11110xxx -> 3). Each trailing byte
# is checked with check_next_sequence, which is defined outside this method.
# NOTE(review): the rewind arithmetic assumes check_next_sequence consumes
# exactly one byte per call - confirm against its definition. Under that
# assumption a rejected sequence leaves the scanner just past the leading
# byte.
#
# Returns the length in bytes of this UTF-8 sequence (2, 3 or 4), or false
# if the sequence is invalid or the scanner is already at the end of input.
def self.sequence_length(scanner)
  lead = scanner.get_byte
  # get_byte returns nil at end of input; report that as "no valid sequence"
  # instead of failing on nil.
  return false if lead.nil?

  # String#[] with an Integer index only yields a number on Ruby 1.8; on
  # later versions it yields a String. unpack('C') gives the byte value on
  # every version.
  leader = lead.unpack('C')[0]

  trailing =
    if (leader >> 5) == 0x6
      1
    elsif (leader >> 4) == 0x0e
      2
    elsif (leader >> 3) == 0x1e
      3
    else
      0
    end

  # Not a multi-byte leading byte (plain ASCII, stray continuation, etc.).
  return false if trailing.zero?

  trailing.times do |checked|
    next if check_next_sequence(scanner)

    # Step back over every trailing byte looked at so far, including the one
    # that was just rejected.
    scanner.pos -= checked + 1
    return false
  end

  trailing + 1
end
valid?(str) click to toggle source

Check if this String is valid UTF-8

Returns true or false.

# File lib/resque/vendor/utf8_util/utf8_util_18.rb, line 9
# Check whether a String passes the sequence_length check at every match of
# HIGH_BIT_RANGE.
#
# str - the String to inspect.
#
# The scanner jumps from one HIGH_BIT_RANGE match to the next, steps back one
# position so that sequence_length starts on the matched byte, and stops at
# the first sequence that sequence_length rejects.
#
# Returns true when no sequence was rejected, false otherwise.
def self.valid?(str)
  scanner = StringScanner.new(str)

  until scanner.skip_until(HIGH_BIT_RANGE).nil?
    # skip_until leaves the scanner just past the match; rewind onto it.
    scanner.pos -= 1

    return false unless sequence_length(scanner)
  end

  true
end