| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165 |
- # lib/http.rb
- require "net/http"
- require "uri"
- require "openssl"
- require "open3"
# Small HTTP GET toolkit built on Net::HTTP: browser-like default headers,
# a minimal name=>value cookie jar, redirect following, retries on transient
# network errors, and a curl fallback for 403/406 responses.
module Http
  BASE_HEADERS = {
    "User-Agent" => "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36",
    "Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language" => "en-AU,en;q=0.9",
    # identity avoids manual gzip handling; servers still respond correctly
    "Accept-Encoding" => "identity",
    "Connection" => "keep-alive",
    # Browser fingerprint headers — modern WAFs check these to distinguish
    # real browsers from scripts. Omitting them triggers 403/challenge pages.
    "Upgrade-Insecure-Requests" => "1",
    "Sec-Fetch-Dest" => "document",
    "Sec-Fetch-Mode" => "navigate",
    "Sec-Fetch-Site" => "none",
    "Sec-Fetch-User" => "?1",
    "sec-ch-ua" => '"Chromium";v="127", "Not)A;Brand";v="99", "Google Chrome";v="127"',
    "sec-ch-ua-mobile" => "?0",
    "sec-ch-ua-platform" => '"Windows"',
  }.freeze

  # Transient network failures that are retried (in .get) or tolerated
  # (during warm-up requests).
  RETRYABLE_ERRORS = [
    OpenSSL::SSL::SSLError,
    EOFError,
    Errno::ECONNRESET,
    Net::ReadTimeout,
    Net::OpenTimeout,
  ].freeze

  # Status codes that trigger the warm-up request and, after that, the curl
  # fallback in .get.
  BLOCKED_STATUSES = [403, 406].freeze

  # Splits several cookies folded into one header value. A comma only counts
  # as a separator when the text after it looks like "name=value" before the
  # next ";", so the comma inside an Expires date is left alone.
  COOKIE_SEPARATOR = /,\s*(?=[^;]+=[^;]+)/

  # Copies every cookie announced by +response+ into +jar+ (mutated in place).
  # Only the leading name=value pair of each cookie is kept; attributes such
  # as Path, Expires or Domain are ignored.
  #
  # @param jar [Hash{String=>String}] cookie name => value
  # @param response [#get_fields] object answering get_fields("set-cookie")
  def self.merge_set_cookie!(jar, response)
    Array(response.get_fields("set-cookie")).each do |raw|
      raw.split(COOKIE_SEPARATOR).each do |cookie|
        pair = cookie.split(";", 2).first
        next if pair.nil? # empty fragment, e.g. a header starting with ","

        name, value = pair.split("=", 2)
        next if name.nil? || value.nil?

        name = name.strip
        next if name.empty? # "=value" carries no usable cookie name

        jar[name] = value.strip
      end
    end
  end

  # Serialises +jar+ as a Cookie request header value.
  #
  # @param jar [Hash{String=>String}]
  # @return [String, nil] "a=1; b=2", or nil when the jar is empty
  def self.cookie_header(jar)
    return nil if jar.empty?

    jar.map { |name, value| "#{name}=#{value}" }.join("; ")
  end

  # Performs one GET on a fresh connection and records any Set-Cookie
  # headers of the response in +jar+. Redirects are NOT followed here.
  # TLS verification is disabled only when ENV["ALLOW_INSECURE"] == "1".
  #
  # @param uri [URI::HTTP] absolute http/https URI
  # @param headers [Hash] extra headers, merged over BASE_HEADERS
  # @param jar [Hash] cookie jar; sent with the request and updated in place
  # @param referer [String, nil] Referer header value, omitted when nil
  # @return [Net::HTTPResponse]
  def self.request(uri, headers: {}, jar: {}, referer: nil)
    http = Net::HTTP.new(uri.host, uri.port)
    http.use_ssl = uri.scheme == "https"
    http.verify_mode =
      ENV["ALLOW_INSECURE"] == "1" ? OpenSSL::SSL::VERIFY_NONE : OpenSSL::SSL::VERIFY_PEER
    http.read_timeout = 30
    http.open_timeout = 15
    http.keep_alive_timeout = 10

    request_headers = BASE_HEADERS.merge(headers)
    request_headers["Referer"] = referer if referer
    cookies = cookie_header(jar)
    request_headers["Cookie"] = cookies if cookies

    get_request = Net::HTTP::Get.new(uri.request_uri, request_headers)

    # The block form of #start closes the connection when the block exits,
    # including on exceptions, so no explicit finish is needed.
    http.start do |connection|
      response = connection.request(get_request)
      merge_set_cookie!(jar, response)
      response
    end
  end

  # Generic GET with retries, cookie jar persistence, and 403 warmup + curl fallback
  #
  # Flow: redirects are followed with the same jar (at most +max_redirects+
  # hops); a first 403/406 triggers one warm-up request to the referer to
  # collect cookies; a second 403/406 falls back to the curl binary.
  #
  # @param url [String] absolute http/https URL
  # @param headers [Hash] extra request headers
  # @param tries [Integer] number of transient network failures after which
  #   the last such error is re-raised (exponential back-off in between)
  # @param referer [String, nil] Referer to send; defaults to the root of the
  #   host currently being requested
  # @param max_redirects [Integer] redirect hops allowed before giving up
  # @return [String] response body
  # @raise [RuntimeError] on a non-success status, a redirect without
  #   Location, or too many redirects
  def self.get(url, headers: {}, tries: 4, referer: nil, max_redirects: 10)
    uri = URI.parse(url)
    jar = {}
    warmed = false
    attempts = 0
    redirects = 0

    loop do
      ref = referer || "#{uri.scheme}://#{uri.host}/"
      begin
        res = request(uri, headers: headers, jar: jar, referer: ref)
        case res
        when Net::HTTPRedirection
          loc = res["location"]
          raise "redirect without location" unless loc

          # Bound the chain so a redirect loop cannot spin forever.
          redirects += 1
          raise "too many redirects (limit #{max_redirects})" if redirects > max_redirects

          uri = URI.join(uri.to_s, loc)
          next # follow redirect immediately with same jar
        when Net::HTTPSuccess
          return res.body
        else
          if BLOCKED_STATUSES.include?(res.code.to_i)
            unless warmed
              # warm up same-site, then try again once
              begin
                request(URI.parse(ref), headers: headers, jar: jar, referer: ref)
              rescue *RETRYABLE_ERRORS
                # best effort: the retry below happens either way
              end
              warmed = true
              next
            end

            body = curl_fallback(uri, ref, jar)
            return body if body
          end
          raise "#{res.code} #{res.message}"
        end
      rescue *RETRYABLE_ERRORS => e
        attempts += 1
        raise e if attempts >= tries

        sleep(2**attempts)
        next
      end
    end
  end

  # Dorset eServices: tolerant warm-up and HTTPS→HTTP fallback
  #
  # For each scheme (https first, then http) on the target's host: request
  # "/" and "/eservice/" to collect cookies (failures ignored), then request
  # the target itself, following at most one redirect.
  #
  # @param target_url [String] absolute URL on the Dorset host
  # @return [String] body of the first successful response
  # @raise [RuntimeError] when neither scheme yields a success response
  def self.dorset_session_get(target_url)
    host = URI.parse(target_url).host
    warm_paths = ["/", "/eservice/"]

    ["https://#{host}", "http://#{host}"].each do |base|
      jar = {}
      entry_referer = "#{base}/eservice/"
      begin
        warm_paths.each do |path|
          begin
            request(URI.parse("#{base}#{path}"), headers: {}, jar: jar, referer: "#{base}/")
          rescue *RETRYABLE_ERRORS
            # warm-up is best effort
          end
        end

        # Re-target the URL at the scheme/host currently being tried.
        tgt = URI.parse(target_url.sub(%r{\Ahttps?://[^/]+}, base))
        res = request(tgt, headers: {}, jar: jar, referer: entry_referer)
        if res.is_a?(Net::HTTPRedirection) && res["location"]
          redirected = URI.join(tgt.to_s, res["location"])
          res = request(redirected, headers: {}, jar: jar, referer: entry_referer)
        end
        return res.body if res.is_a?(Net::HTTPSuccess)
      rescue *RETRYABLE_ERRORS
        next
      end
    end

    raise "Dorset fetch failed via HTTPS and HTTP"
  end

  # Last-resort fetch through the curl binary, sending the browser-like
  # headers and the cookies gathered so far. Argument-array form is used so
  # no shell is involved.
  #
  # @return [String, nil] body on success; nil when curl is missing, exits
  #   non-zero (--fail makes HTTP errors non-zero), or prints nothing
  def self.curl_fallback(uri, referer, jar)
    command = [
      "curl", "-sSL", "--fail", "--compressed",
      "-A", BASE_HEADERS["User-Agent"],
      "-H", "Accept: #{BASE_HEADERS["Accept"]}",
      "-H", "Accept-Language: #{BASE_HEADERS["Accept-Language"]}",
      "-H", "Upgrade-Insecure-Requests: 1",
      "-H", "Sec-Fetch-Dest: document",
      "-H", "Sec-Fetch-Mode: navigate",
      "-H", "Sec-Fetch-Site: same-origin",
      "-H", "Sec-Fetch-User: ?1",
      "-H", "sec-ch-ua: #{BASE_HEADERS["sec-ch-ua"]}",
      "-H", "sec-ch-ua-mobile: ?0",
      "-H", "sec-ch-ua-platform: #{BASE_HEADERS["sec-ch-ua-platform"]}",
      "-e", referer
    ]
    cookies = cookie_header(jar)
    command.push("-b", cookies) if cookies
    command << uri.to_s

    out, status = Open3.capture2(*command)
    return nil unless status.success?

    out.to_s.strip.empty? ? nil : out
  rescue Errno::ENOENT
    # curl is not installed; let the caller report the original HTTP error
    nil
  end
  private_class_method :curl_fallback
end
|