http.rb 5.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165
  1. # lib/http.rb
  2. require "net/http"
  3. require "uri"
  4. require "openssl"
  5. require "open3"
  # Small HTTP GET toolkit built on Net::HTTP: browser-like default headers,
  # a minimal cookie jar (a plain Hash of name => value), bounded redirect
  # following, retry with backoff on transient transport errors, and a curl
  # fallback for 403/406 responses.
  module Http
    # Default headers sent with every request; caller-supplied headers override them.
    BASE_HEADERS = {
      "User-Agent" => "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36",
      "Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
      "Accept-Language" => "en-AU,en;q=0.9",
      # "identity" asks for an uncompressed body so no gzip handling is needed here.
      "Accept-Encoding" => "identity",
      "Connection" => "keep-alive",
      # Browser fingerprint headers — modern WAFs check these to distinguish
      # real browsers from scripts. Omitting them triggers 403/challenge pages.
      "Upgrade-Insecure-Requests" => "1",
      "Sec-Fetch-Dest" => "document",
      "Sec-Fetch-Mode" => "navigate",
      "Sec-Fetch-Site" => "none",
      "Sec-Fetch-User" => "?1",
      "sec-ch-ua" => '"Chromium";v="127", "Not)A;Brand";v="99", "Google Chrome";v="127"',
      "sec-ch-ua-mobile" => "?0",
      "sec-ch-ua-platform" => '"Windows"',
    }.freeze

    # Transport failures that are worth retrying or tolerating.
    TRANSIENT_ERRORS = [
      OpenSSL::SSL::SSLError, EOFError, Errno::ECONNRESET,
      Net::ReadTimeout, Net::OpenTimeout
    ].freeze

    # Failures to even establish a connection. Used (together with
    # TRANSIENT_ERRORS) only where another base URL can still be tried.
    CONNECT_ERRORS = [
      SocketError, Errno::ECONNREFUSED, Errno::EHOSTUNREACH, Errno::ETIMEDOUT
    ].freeze

    # Upper bound on redirects followed by a single .get call.
    MAX_REDIRECTS = 10

    # Status codes treated as "blocked by a WAF" in .get.
    BLOCKED_CODES = [403, 406].freeze

    # Splits a folded Set-Cookie value on commas that are followed by a
    # name=value pair, so commas inside an Expires date are not split points
    # that produce a usable cookie.
    COOKIE_SEPARATOR = /,\s*(?=[^;]+=[^;]+)/.freeze

    # Copies the name/value pairs of every Set-Cookie header of +response+
    # into +jar+ (mutated in place). Cookie attributes (Path, Expires, ...)
    # are ignored; segments without "=" or with an empty name are skipped.
    #
    # @param jar [Hash{String=>String}] cookie store to update
    # @param response [#get_fields] object answering get_fields("set-cookie")
    def self.merge_set_cookie!(jar, response)
      Array(response.get_fields("set-cookie")).each do |raw|
        raw.split(COOKIE_SEPARATOR).each do |cookie|
          # to_s: an empty segment splits to [], whose first element is nil.
          name, value = cookie.split(";", 2).first.to_s.split("=", 2)
          next if name.nil? || value.nil?

          name = name.strip
          next if name.empty?

          jar[name] = value.strip
        end
      end
    end

    # Serialises +jar+ into a Cookie request-header value.
    #
    # @param jar [Hash{String=>String}]
    # @return [String, nil] "k=v; k2=v2", or nil when the jar is empty
    def self.cookie_header(jar)
      return nil if jar.empty?

      jar.map { |name, value| "#{name}=#{value}" }.join("; ")
    end

    # Performs a single GET (no redirect following, no retries) and records
    # any cookies the server sets into +jar+.
    #
    # @param uri [URI::HTTP] target; TLS is used when the scheme is "https"
    # @param headers [Hash] extra headers merged over BASE_HEADERS
    # @param jar [Hash] cookie jar; sent as Cookie and updated from the response
    # @param referer [String, nil] value for the Referer header, if any
    # @return [Net::HTTPResponse]
    def self.request(uri, headers: {}, jar: {}, referer: nil)
      http = Net::HTTP.new(uri.host, uri.port)
      http.use_ssl = uri.scheme == "https"
      # Certificate verification is only disabled by an explicit opt-in.
      http.verify_mode = ENV["ALLOW_INSECURE"] == "1" ? OpenSSL::SSL::VERIFY_NONE : OpenSSL::SSL::VERIFY_PEER
      http.read_timeout = 30
      http.open_timeout = 15
      http.keep_alive_timeout = 10

      request_headers = BASE_HEADERS.merge(headers)
      request_headers["Referer"] = referer if referer
      cookies = cookie_header(jar)
      request_headers["Cookie"] = cookies if cookies
      get_request = Net::HTTP::Get.new(uri.request_uri, request_headers)

      # The block form of #start closes the connection when the block exits,
      # so no separate ensure/finish is needed.
      http.start do |connection|
        response = connection.request(get_request)
        merge_set_cookie!(jar, response)
        response
      end
    end

    # Generic GET with cookie-jar persistence across attempts.
    #
    # * Redirects are followed with the same jar, at most MAX_REDIRECTS times.
    # * Transient transport errors are retried with exponential backoff
    #   (2, 4, 8... seconds); the error is re-raised on the +tries+-th failure.
    # * On the first 403/406 the referer URL is requested once to pick up
    #   cookies, then the target is retried. A second 403/406 is handed to
    #   the curl binary as a last resort.
    #
    # @param url [String] absolute http(s) URL
    # @param headers [Hash] extra request headers
    # @param tries [Integer] number of transport failures tolerated before giving up
    # @param referer [String, nil] Referer to send; defaults to the target's origin
    # @return [String] response body
    # @raise [RuntimeError] on a non-success status, a redirect without
    #   Location, or too many redirects
    def self.get(url, headers: {}, tries: 4, referer: nil)
      uri = URI.parse(url)
      jar = {}
      warmed = false
      attempts = 0
      redirects = 0

      loop do
        ref = referer || origin_of(uri)
        begin
          res = request(uri, headers: headers, jar: jar, referer: ref)
          case res
          when Net::HTTPRedirection
            loc = res["location"] || raise("redirect without location")
            redirects += 1
            # Guards against redirect cycles, which would otherwise spin forever.
            raise "too many redirects (limit #{MAX_REDIRECTS})" if redirects > MAX_REDIRECTS

            uri = URI.join(uri.to_s, loc)
          when Net::HTTPSuccess
            return res.body
          else
            blocked = BLOCKED_CODES.include?(res.code.to_i)
            if blocked && !warmed
              warm_up(ref, headers, jar)
              warmed = true
              next
            end
            if blocked
              body = curl_fallback(uri, ref)
              return body if body
            end
            raise "#{res.code} #{res.message}"
          end
        rescue *TRANSIENT_ERRORS => e
          attempts += 1
          raise e if attempts >= tries

          sleep(2**attempts)
        end
      end
    end

    # Dorset eServices: tolerant warm-up and HTTPS→HTTP fallback.
    #
    # For each of https://host and http://host (in that order): requests "/"
    # and "/eservice/" to collect cookies (failures ignored), then fetches the
    # target path on that base, following at most one redirect. The first
    # successful body wins. Connection-level failures (refused, unreachable,
    # DNS) also move on to the next base instead of aborting the fallback.
    #
    # @param target_url [String] absolute URL; its scheme and authority are
    #   replaced by each base in turn
    # @return [String] response body
    # @raise [RuntimeError] when neither base yields a success response
    def self.dorset_session_get(target_url)
      host = URI.parse(target_url).host
      fallback_errors = TRANSIENT_ERRORS + CONNECT_ERRORS

      ["https://#{host}", "http://#{host}"].each do |base|
        jar = {}
        begin
          ["/", "/eservice/"].each do |path|
            begin
              request(URI.parse("#{base}#{path}"), headers: {}, jar: jar, referer: "#{base}/")
            rescue *fallback_errors
              # Warm-up is best-effort; the target request below decides the outcome.
            end
          end

          tgt = URI.parse(target_url.sub(%r{\Ahttps?://[^/]+}, base))
          res = request(tgt, headers: {}, jar: jar, referer: "#{base}/eservice/")
          if res.is_a?(Net::HTTPRedirection) && res["location"]
            res = request(URI.join(tgt.to_s, res["location"]), headers: {}, jar: jar, referer: "#{base}/eservice/")
          end
          return res.body if res.is_a?(Net::HTTPSuccess)
        rescue *fallback_errors
          next
        end
      end

      raise "Dorset fetch failed via HTTPS and HTTP"
    end

    # Origin of +uri+ with a trailing slash; the port is kept only when it
    # differs from the scheme's default.
    def self.origin_of(uri)
      port = uri.port == uri.default_port ? "" : ":#{uri.port}"
      "#{uri.scheme}://#{uri.host}#{port}/"
    end

    # Best-effort request to +url+ (sent with itself as Referer) so that any
    # cookies it sets land in +jar+. Transient transport errors are ignored.
    def self.warm_up(url, headers, jar)
      request(URI.parse(url), headers: headers, jar: jar, referer: url)
    rescue *TRANSIENT_ERRORS
      nil
    end

    # Fetches +uri+ with the curl binary (array form, so no shell is involved).
    # --fail makes curl exit non-zero on HTTP errors instead of printing the
    # error page, so a blocked response is not mistaken for content.
    #
    # @return [String, nil] body, or nil when curl is unavailable, fails, or
    #   returns only whitespace
    def self.curl_fallback(uri, ref)
      out, status = Open3.capture2(
        "curl", "-sSL", "--fail", "--compressed",
        "-A", BASE_HEADERS["User-Agent"],
        "-H", "Accept: #{BASE_HEADERS["Accept"]}",
        "-H", "Accept-Language: #{BASE_HEADERS["Accept-Language"]}",
        "-H", "Upgrade-Insecure-Requests: 1",
        "-H", "Sec-Fetch-Dest: document",
        "-H", "Sec-Fetch-Mode: navigate",
        "-H", "Sec-Fetch-Site: same-origin",
        "-H", "Sec-Fetch-User: ?1",
        "-H", "sec-ch-ua: #{BASE_HEADERS["sec-ch-ua"]}",
        "-H", "sec-ch-ua-mobile: ?0",
        "-H", "sec-ch-ua-platform: #{BASE_HEADERS["sec-ch-ua-platform"]}",
        "-e", ref,
        uri.to_s
      )
      return nil unless status.success?

      out.to_s.strip.empty? ? nil : out
    rescue SystemCallError
      # curl is not installed or not executable; let the caller report the HTTP status.
      nil
    end

    private_class_method :origin_of, :warm_up, :curl_fallback
  end