module Addressable::IDNA

Constants

ACE_MAX_LENGTH
ACE_PREFIX
COMPOSITION_TABLE
HANGUL_LBASE
HANGUL_LCOUNT
HANGUL_NCOUNT
HANGUL_SBASE
HANGUL_SCOUNT
HANGUL_TBASE
HANGUL_TCOUNT
HANGUL_VBASE
HANGUL_VCOUNT
PUNYCODE_BASE
PUNYCODE_DAMP
PUNYCODE_DELIMITER
PUNYCODE_INITIAL_BIAS
PUNYCODE_INITIAL_N
PUNYCODE_MAXINT
PUNYCODE_PRINT_ASCII
PUNYCODE_SKEW
PUNYCODE_TMAX
PUNYCODE_TMIN
UNICODE_DATA

This is a sparse Unicode table. Codepoints without entries are assumed to have the value [0, 0, nil, nil, nil, nil, nil].

UNICODE_DATA_CANONICAL
UNICODE_DATA_COMBINING_CLASS
UNICODE_DATA_COMPATIBILITY
UNICODE_DATA_EXCLUSION
UNICODE_DATA_LOWERCASE
UNICODE_DATA_TITLECASE
UNICODE_DATA_UPPERCASE
UNICODE_MAX_LENGTH
UNICODE_TABLE

This module is loosely based on idn_actionmailer by Mick Staugaard, the unicode library by Yoshida Masato, and the punycode implementation by Kazuhiro Nishiyama. Most of the code was copied verbatim, but some reformatting was done, and some translation from C was done.

Without their code to work from as a base, we'd all still be relying on the presence of libidn, which nobody ever seems to have installed.

Original sources: github.com/staugaard/idn_actionmailer, www.yoshidam.net/Ruby.html#unicode, rubyforge.org/frs/?group_id=2550

UTF8_REGEX
UTF8_REGEX_MULTIBYTE

Public Class Methods

to_ascii(value) click to toggle source
# File lib/addressable/idna/native.rb, line 35
# Converts each dot-separated label of +value+ with IDN::Idna.toASCII.
#
# The value is stringified and split on "." with a negative limit, so empty
# labels (including trailing ones) are preserved. Empty labels stay empty,
# labels of 64 or more characters are passed through untouched, and every
# other label is handed to IDN::Idna.toASCII with ALLOW_UNASSIGNED.
#
# @param [#to_s] value The host name to convert.
#
# @return [String] The converted labels re-joined with ".".
def self.to_ascii(value)
  labels = value.to_s.split('.', -1)
  converted = labels.map do |label|
    label_size = label.size
    if label_size >= 64
      # Too long to be converted; keep as-is.
      label
    elsif label_size > 0
      IDN::Idna.toASCII(label, IDN::Idna::ALLOW_UNASSIGNED)
    else
      ''
    end
  end
  converted.join('.')
end
to_unicode(value) click to toggle source
# File lib/addressable/idna/native.rb, line 47
# Converts each dot-separated label of +value+ with IDN::Idna.toUnicode.
#
# The value is stringified and split on "." with a negative limit, so empty
# labels (including trailing ones) are preserved. Empty labels stay empty,
# labels of 64 or more characters are passed through untouched, and every
# other label is handed to IDN::Idna.toUnicode with ALLOW_UNASSIGNED.
#
# @param [#to_s] value The host name to convert.
#
# @return [String] The converted labels re-joined with ".".
def self.to_unicode(value)
  labels = value.to_s.split('.', -1)
  converted = labels.map do |label|
    label_size = label.size
    if label_size >= 64
      # Too long to be converted; keep as-is.
      label
    elsif label_size > 0
      IDN::Idna.toUnicode(label, IDN::Idna::ALLOW_UNASSIGNED)
    else
      ''
    end
  end
  converted.join('.')
end
unicode_normalize_kc(value) click to toggle source
# File lib/addressable/idna/native.rb, line 31
# Stringifies +value+ and passes it to IDN::Stringprep.nfkc_normalize.
#
# @param [#to_s] value The text to normalize.
#
# @return The result of IDN::Stringprep.nfkc_normalize.
def self.unicode_normalize_kc(value)
  text = value.to_s
  IDN::Stringprep.nfkc_normalize(text)
end

Private Class Methods

lookup_unicode_combining_class(codepoint) click to toggle source
# File lib/addressable/idna/pure.rb, line 281
# Reads the combining-class field for +codepoint+ from UNICODE_DATA.
#
# @param codepoint The key to look up in UNICODE_DATA.
#
# @return The stored combining class, or 0 when the codepoint has no entry
#   or the entry's combining-class field is nil/false.
def self.lookup_unicode_combining_class(codepoint)
  entry = UNICODE_DATA[codepoint]
  return 0 unless entry

  entry[UNICODE_DATA_COMBINING_CLASS] || 0
end
lookup_unicode_compatibility(codepoint) click to toggle source
# File lib/addressable/idna/pure.rb, line 289
# Reads the compatibility field for +codepoint+ from UNICODE_DATA.
#
# @param codepoint The key to look up in UNICODE_DATA.
#
# @return The stored compatibility value, or nil when the codepoint has no
#   entry in the table.
def self.lookup_unicode_compatibility(codepoint)
  entry = UNICODE_DATA[codepoint]
  return nil unless entry

  entry[UNICODE_DATA_COMPATIBILITY]
end
lookup_unicode_composition(unpacked) click to toggle source
# File lib/addressable/idna/pure.rb, line 304
# Looks +unpacked+ up in COMPOSITION_TABLE.
#
# @param unpacked The table key. The caller visible alongside this method,
#   unicode_compose_pair, passes the array it filled through ucs4_to_utf8.
#
# @return The table's value for the key (whatever COMPOSITION_TABLE yields
#   for a key it does not contain otherwise).
def self.lookup_unicode_composition(unpacked)
  COMPOSITION_TABLE[unpacked]
end
lookup_unicode_lowercase(codepoint) click to toggle source
# File lib/addressable/idna/pure.rb, line 296
# Reads the lowercase mapping for +codepoint+ from UNICODE_DATA.
#
# @param codepoint The key to look up in UNICODE_DATA.
#
# @return The stored lowercase value, or +codepoint+ itself when the table
#   has no entry or the entry's lowercase field is nil/false.
def self.lookup_unicode_lowercase(codepoint)
  entry = UNICODE_DATA[codepoint]
  return codepoint unless entry

  entry[UNICODE_DATA_LOWERCASE] || codepoint
end
punycode_adapt(delta, numpoints, firsttime) click to toggle source

Bias adaptation method

# File lib/addressable/idna/pure.rb, line 659
# Bias adaptation method.
#
# Scales +delta+ (dividing by PUNYCODE_DAMP on the first call, halving it
# otherwise), adds delta / numpoints, then repeatedly divides by
# (PUNYCODE_BASE - PUNYCODE_TMIN) until the value no longer exceeds
# ((PUNYCODE_BASE - PUNYCODE_TMIN) * PUNYCODE_TMAX) / 2, accumulating
# PUNYCODE_BASE into k on every division.
#
# @param delta The delta to adapt from (all arithmetic is integer division
#   when integers are supplied).
# @param numpoints The divisor applied to the scaled delta.
# @param firsttime Truthy to damp with PUNYCODE_DAMP instead of halving.
#
# @return The new bias.
def self.punycode_adapt(delta, numpoints, firsttime)
  scaled =
    if firsttime
      delta / PUNYCODE_DAMP
    else
      # Right shift by one bit: a faster way of dividing by two.
      delta >> 1
    end
  scaled += scaled / numpoints

  span = PUNYCODE_BASE - PUNYCODE_TMIN
  threshold = (span * PUNYCODE_TMAX) / 2

  k = 0
  while scaled > threshold
    scaled /= span
    k += PUNYCODE_BASE
  end

  k + (span + 1) * scaled / (scaled + PUNYCODE_SKEW)
end
punycode_basic?(codepoint) click to toggle source
# File lib/addressable/idna/pure.rb, line 627
# Reports whether +codepoint+ is below 0x80 (128).
#
# @param codepoint The value to test; it is compared against an Integer,
#   so presumably an Integer codepoint.
#
# @return [Boolean] true when codepoint < 0x80.
def self.punycode_basic?(codepoint)
  codepoint < 0x80
end
punycode_decode(value) click to toggle source
# File lib/addressable/idna/native.rb, line 27
# Stringifies +value+ and passes it to IDN::Punycode.decode.
#
# @param [#to_s] value The text to decode.
#
# @return The result of IDN::Punycode.decode.
def self.punycode_decode(value)
  encoded = value.to_s
  IDN::Punycode.decode(encoded)
end
punycode_decode_digit(codepoint) click to toggle source

Returns the numeric value of a basic codepoint (for use in representing integers) in the range 0 to base - 1, or PUNYCODE_BASE if codepoint does not represent a value.

# File lib/addressable/idna/pure.rb, line 645
# Returns the numeric value of a basic codepoint (for use in representing
# integers) in the range 0 to base - 1, or PUNYCODE_BASE if codepoint does
# not represent a value.
#
# Each character class is bounded on both sides. Checking only
# `codepoint - 48 < 10` (a C idiom that relies on unsigned wrap-around)
# is true for every codepoint below 58 in Ruby, which mapped characters
# such as "-" or ":" to bogus or negative digits instead of PUNYCODE_BASE.
#
# @param [Integer] codepoint The codepoint to interpret as a digit.
#
# @return [Integer] 26..35 for "0".."9", 0..25 for "A".."Z" and "a".."z",
#   PUNYCODE_BASE for anything else.
def self.punycode_decode_digit(codepoint)
  if codepoint.between?(48, 57)
    # "0".."9" => 26..35
    codepoint - 22
  elsif codepoint.between?(65, 90)
    # "A".."Z" => 0..25
    codepoint - 65
  elsif codepoint.between?(97, 122)
    # "a".."z" => 0..25
    codepoint - 97
  else
    PUNYCODE_BASE
  end
end
punycode_delimiter?(codepoint) click to toggle source
# File lib/addressable/idna/pure.rb, line 632
# Reports whether +codepoint+ equals PUNYCODE_DELIMITER.
#
# @param codepoint The value to compare against PUNYCODE_DELIMITER.
#
# @return [Boolean] The result of the equality comparison.
def self.punycode_delimiter?(codepoint)
  codepoint == PUNYCODE_DELIMITER
end
punycode_encode(value) click to toggle source
# File lib/addressable/idna/native.rb, line 23
# Stringifies +value+ and passes it to IDN::Punycode.encode.
#
# @param [#to_s] value The text to encode.
#
# @return The result of IDN::Punycode.encode.
def self.punycode_encode(value)
  plain = value.to_s
  IDN::Punycode.encode(plain)
end
punycode_encode_digit(d) click to toggle source
# File lib/addressable/idna/pure.rb, line 637
# Maps a digit value to a codepoint: values below 26 are offset by 97
# (so 0 becomes 97, "a"), all other values are offset by 22 (so 26
# becomes 48, "0").
#
# @param [Integer] d The digit value to encode.
#
# @return [Integer] The codepoint for the digit.
def self.punycode_encode_digit(d)
  offset = d < 26 ? 97 : 22
  d + offset
end
ucs4_to_utf8(char, buffer) click to toggle source
# File lib/addressable/idna/pure.rb, line 187
# Appends the UTF-8 style byte sequence for +char+ to +buffer+ using <<.
#
# Values below 0x80 are appended as a single unit. Larger values get a
# lead byte followed by one to five continuation bytes carrying six bits
# each; the 5- and 6-byte forms (up to 0x7FFFFFFF) are emitted as well.
# Values of 0x80000000 and above append nothing.
#
# @param [Integer] char The codepoint to encode.
# @param [#<<] buffer The receiver of the bytes (the caller visible
#   alongside this method passes an Array).
#
# @return The buffer after appending, or nil when nothing was appended.
def self.ucs4_to_utf8(char, buffer)
  return buffer << char if char < 0x80

  # Lead-byte marker and number of continuation bytes for each size class.
  lead_marker, continuation_count =
    if char < 0x800
      [0xC0, 1]
    elsif char < 0x10000
      [0xE0, 2]
    elsif char < 0x200000
      [0xF0, 3]
    elsif char < 0x4000000
      [0xF8, 4]
    elsif char < 0x80000000
      [0xFC, 5]
    else
      return nil
    end

  buffer << ((char >> (6 * continuation_count)) | lead_marker)
  (continuation_count - 1).downto(0) do |position|
    buffer << (((char >> (6 * position)) & 0x3F) | 0x80)
  end
  buffer
end
unicode_compose(unpacked) click to toggle source
# File lib/addressable/idna/pure.rb, line 138
# Folds a sequence of codepoints by repeatedly trying to combine the
# current pending codepoint with the next one via unicode_compose_pair.
#
# An empty input is returned as-is (the same object). Otherwise a new
# array is built: whenever a pair composes, the composite becomes the
# pending codepoint; when it does not, the pending codepoint is emitted
# and the next one takes its place.
#
# Note that only the combining class of the FIRST codepoint is consulted:
# if it is non-zero, no composition is attempted anywhere in the sequence.
#
# @param [Array<Integer>] unpacked The codepoints to compose.
#
# @return [Array<Integer>] The composed codepoints.
def self.unicode_compose(unpacked)
  return unpacked if unpacked.length.zero?

  composed = []
  pending = unpacked[0]
  composable = lookup_unicode_combining_class(pending) == 0

  unpacked.drop(1).each do |codepoint|
    merged = composable ? unicode_compose_pair(pending, codepoint) : nil
    if merged.nil?
      composed << pending
      pending = codepoint
    else
      pending = merged
    end
  end

  composed << pending
end
unicode_compose_pair(ch_one, ch_two) click to toggle source
# File lib/addressable/idna/pure.rb, line 163
# Attempts to compose two codepoints into one.
#
# Hangul is handled arithmetically: a leading consonant followed by a vowel
# yields an LV syllable, and an LV syllable followed by a trailing consonant
# yields an LVT syllable. Any other pair is encoded through ucs4_to_utf8
# and looked up via lookup_unicode_composition.
#
# The trailing-consonant index must be strictly positive: HANGUL_TBASE
# itself stands for "no trailing consonant", so accepting it would return
# +ch_one+ unchanged and silently swallow +ch_two+.
#
# @param [Integer] ch_one The first codepoint.
# @param [Integer] ch_two The codepoint following it.
#
# @return The composite codepoint, or whatever lookup_unicode_composition
#   returns for a pair without an entry.
def self.unicode_compose_pair(ch_one, ch_two)
  l_index = ch_one - HANGUL_LBASE
  v_index = ch_two - HANGUL_VBASE
  if (0...HANGUL_LCOUNT).cover?(l_index) &&
      (0...HANGUL_VCOUNT).cover?(v_index)
    # Hangul L + V
    return HANGUL_SBASE +
      (l_index * HANGUL_VCOUNT + v_index) * HANGUL_TCOUNT
  end

  s_index = ch_one - HANGUL_SBASE
  t_index = ch_two - HANGUL_TBASE
  if (0...HANGUL_SCOUNT).cover?(s_index) &&
      (s_index % HANGUL_TCOUNT).zero? &&
      (1...HANGUL_TCOUNT).cover?(t_index)
    # Hangul LV + T
    return ch_one + t_index
  end

  # Not Hangul: build the lookup key from both codepoints.
  utf8_bytes = []
  ucs4_to_utf8(ch_one, utf8_bytes)
  ucs4_to_utf8(ch_two, utf8_bytes)

  lookup_unicode_composition(utf8_bytes)
end
unicode_decompose(unpacked) click to toggle source
# File lib/addressable/idna/pure.rb, line 243
# Expands every codepoint of +unpacked+ into its decomposed form.
#
# Codepoints inside the Hangul syllable block
# (HANGUL_SBASE ... HANGUL_SBASE + HANGUL_SCOUNT) are split with
# unicode_decompose_hangul, skipping absent vowel/trailing parts. For any
# other codepoint the compatibility mapping is consulted; when one exists
# it is unpacked as UTF-8 and decomposed recursively, otherwise the
# codepoint is kept unchanged.
#
# @param [Array<Integer>] unpacked The codepoints to decompose.
#
# @return [Array<Integer>] A new array with the decomposed codepoints.
def self.unicode_decompose(unpacked)
  unpacked.each_with_object([]) do |codepoint, decomposed|
    if codepoint >= HANGUL_SBASE && codepoint < HANGUL_SBASE + HANGUL_SCOUNT
      lead, vowel, trail = unicode_decompose_hangul(codepoint)
      decomposed << lead
      decomposed << vowel if vowel
      decomposed << trail if trail
    elsif (mapping = lookup_unicode_compatibility(codepoint))
      # The mapping may itself contain decomposable codepoints.
      decomposed.concat(unicode_decompose(mapping.unpack("U*")))
    else
      decomposed << codepoint
    end
  end
end
unicode_decompose_hangul(codepoint) click to toggle source
# File lib/addressable/idna/pure.rb, line 264
# Splits a Hangul syllable codepoint into its leading, vowel and trailing
# parts using the HANGUL_* constants.
#
# @param [Integer] codepoint The codepoint to split.
#
# @return [Array] A three-element array [l, v, t]. When +codepoint+ lies
#   outside the syllable block the result is [codepoint, nil, nil]; when
#   the syllable has no trailing part, t is nil.
def self.unicode_decompose_hangul(codepoint)
  syllable_index = codepoint - HANGUL_SBASE
  unless syllable_index >= 0 && syllable_index < HANGUL_SCOUNT
    # Not a precomposed syllable: hand the codepoint back untouched.
    return [codepoint, nil, nil]
  end

  lead = HANGUL_LBASE + syllable_index / HANGUL_NCOUNT
  vowel = HANGUL_VBASE + (syllable_index % HANGUL_NCOUNT) / HANGUL_TCOUNT
  trail_offset = syllable_index % HANGUL_TCOUNT
  # An offset of zero means the syllable has no trailing part.
  trail = trail_offset.zero? ? nil : HANGUL_TBASE + trail_offset

  [lead, vowel, trail]
end
unicode_downcase(input) click to toggle source

Unicode aware downcase method.

@api private

@param [String] input The input string.

@return [String] The downcased result.

# File lib/addressable/idna/pure.rb, line 130
# Unicode aware downcase method.
#
# Non-String input is stringified first. The text is unpacked into
# codepoints ("U*"), each codepoint is mapped through
# lookup_unicode_lowercase, and the result is packed back into a string.
#
# @api private
#
# @param [String] input The input string.
#
# @return [String] The downcased result.
def self.unicode_downcase(input)
  text = input.is_a?(String) ? input : input.to_s
  lowered = text.unpack("U*").map do |codepoint|
    lookup_unicode_lowercase(codepoint)
  end
  lowered.pack("U*")
end
unicode_sort_canonical(unpacked) click to toggle source
# File lib/addressable/idna/pure.rb, line 219
# Reorders adjacent codepoints by combining class, working on a copy.
#
# Two neighbours are swapped when both have a non-zero combining class and
# the earlier one's class is greater than the later one's. After a swap
# the scan steps back one position (but never before index 1) so the moved
# codepoint can keep travelling backwards; otherwise it advances.
#
# @param [Array<Integer>] unpacked The codepoints to reorder.
#
# @return [Array<Integer>] A reordered duplicate; the argument is not
#   modified.
def self.unicode_sort_canonical(unpacked)
  sorted = unpacked.dup
  count = sorted.length
  return sorted if count < 2

  index = 1
  while index < count
    previous_cc = lookup_unicode_combining_class(sorted[index - 1])
    current_cc = lookup_unicode_combining_class(sorted[index])

    if current_cc != 0 && previous_cc != 0 && previous_cc > current_cc
      sorted[index - 1], sorted[index] = sorted[index], sorted[index - 1]
      index -= 1 if index > 1
    else
      index += 1
    end
  end

  sorted
end