planbuild.rb 5.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180
  1. # scrapers/planbuild.rb
  2. require "date"
  3. require "json"
  4. require "nokogiri"
  5. require "uri"
  6. require "net/http"
  7. require "zlib"
  8. require "stringio"
  9. require "fileutils"
  10. require_relative "../lib/http"
  11. require_relative "../lib/db"
  12. require_relative "../lib/log"
  13. require_relative "../lib/util"
  14. require_relative "../lib/geocode"
  15. require_relative "../lib/enrich"
  # --- configuration ---
  # Table named by TABLE_NAME; ENV.fetch raises KeyError when it is not set.
  TABLE = ENV.fetch("TABLE_NAME")
  # Portal root and the advertisement search page built from it.
  BASE = "https://portal.planbuild.tas.gov.au"
  PAGE = "#{BASE}/external/advertisement/search"
  # Attachments are downloaded only when DOWNLOAD_ATTACHMENTS is exactly "1".
  DOWNLOAD_ATTACHMENTS = ENV["DOWNLOAD_ATTACHMENTS"] == "1"
  # Directory for downloaded attachments; falls back to /app/downloads when unset.
  DOWNLOAD_DIR = ENV.fetch("DOWNLOAD_DIR", "/app/downloads")
  # Make sure the configured table exists before any scraping starts.
  DB.ensure_table!(TABLE)
  22. # --- cookie + csrf helpers ---
  # Copies the cookies announced by a response into +jar+.
  #
  # Each Set-Cookie field is split on commas that are followed by a
  # "name=" pair, so several cookies folded into one header value are
  # handled as well. Only the leading name=value of each piece is stored;
  # attributes such as Path are ignored, and a piece without a non-empty
  # value leaves the jar untouched.
  #
  # jar - Hash of cookie name => value, modified in place.
  # res - response object answering get_fields("set-cookie").
  #
  # Returns the list of Set-Cookie fields that was examined.
  def merge_set_cookie!(jar, res)
    fields = res.get_fields("set-cookie") || []
    fields.flat_map { |field| field.split(/,(?=[^;]+?=)/) }.each do |cookie|
      pair = cookie.match(/\A\s*([^=;,\s]+)\s*=\s*([^;,\s]+)/)
      next unless pair

      jar[pair[1]] = pair[2]
    end
    fields
  end
  # Builds the Cookie request header value.
  #
  # The three fixed cookies (accepted, disclaimerAccepted,
  # insecureSiteWideBanner) always come first, followed by every
  # name=value pair held in +jar+, all joined with "; ".
  #
  # jar - Hash of cookie name => value.
  #
  # Returns the header value as a String.
  def cookie_header(jar)
    fixed = "accepted=1; disclaimerAccepted=true; insecureSiteWideBanner=1"
    session_pairs = jar.map { |name, value| "#{name}=#{value}" }
    [fixed, *session_pairs].join("; ")
  end
  37. # --- fetch list of advertisements ---
  # Fetches the advertisement list from the PlanBuild portal.
  #
  # Two requests are made:
  #   1. GET the search page to read the CSRF token (and the name of the
  #      header that must carry it) from its meta tags and to pick up the
  #      SESSION cookie.
  #   2. POST an empty `lgas` list as JSON to the listadvertisements endpoint.
  #
  # Returns [items, jar, token, hdr]. +items+ is always an Array: it is
  # empty when the response is neither a JSON array nor an object holding an
  # "items" array, so callers can iterate it without nil checks.
  #
  # Raises RuntimeError when the CSRF token or the SESSION cookie is missing,
  # or when the POST does not return a 2xx status.
  # Raises JSON::ParserError when a successful response body is not JSON.
  def fetch_list
    jar = {}

    # 1) GET page to grab CSRF + SESSION
    page = Http.request(URI(PAGE), headers: { "Referer" => BASE }, jar: jar)
    merge_set_cookie!(jar, page)
    doc = Nokogiri::HTML(page.body)
    token = doc.at(%{meta[name="_csrf"]})&.[]("content")
    hdr = doc.at(%{meta[name="_csrf_header"]})&.[]("content") || "X-CSRF-TOKEN"
    raise "no CSRF token" unless token
    raise "no SESSION cookie" unless jar["SESSION"]

    # 2) POST listadvertisements
    uri = URI("#{BASE}/external/advertisement/search/listadvertisements")
    req = Net::HTTP::Post.new(uri)
    req["Content-Type"] = "application/json"
    req["X-Requested-With"] = "XMLHttpRequest"
    req["Origin"] = BASE
    req["Referer"] = PAGE
    req[hdr] = token
    req["Cookie"] = cookie_header(jar)
    req.body = { lgas: [] }.to_json
    res = Net::HTTP.start(uri.host, uri.port, use_ssl: true) { |h| h.request(req) }

    # Fail with the HTTP status instead of an opaque JSON parse error on an
    # HTML error page.
    raise "listadvertisements failed: HTTP #{res.code}" unless res.is_a?(Net::HTTPSuccess)

    js = JSON.parse(res.body.to_s)
    items =
      case js
      when Array then js
      when Hash then js["items"]
      end
    # Anything that is not a list (nil, scalar, missing key) becomes [].
    items = [] unless items.is_a?(Array)

    [items, jar, token, hdr]
  end
  63. # --- fetch details — always returns a Hash ---
  # Fetches the detail record of one advertisement.
  #
  # uuid  - advertisement identifier, interpolated into the request URL.
  # jar   - Hash of cookies sent via cookie_header.
  # token - CSRF token value.
  # hdr   - name of the request header that carries the token.
  #
  # Returns the parsed JSON object as a Hash. Returns {} when the response
  # is not 2xx, has no body, is not valid gzip/JSON, or parses to something
  # other than a JSON object.
  #
  # Connection-level errors (timeouts, socket errors) are not rescued here
  # and propagate to the caller.
  def fetch_detail(uuid, jar, token, hdr)
    uri = URI("#{BASE}/external/advertisement/#{uuid}/get")
    req = Net::HTTP::Get.new(uri)
    req["X-Requested-With"] = "XMLHttpRequest"
    req["Referer"] = PAGE
    req[hdr] = token
    req["Cookie"] = cookie_header(jar)
    res = Net::HTTP.start(uri.host, uri.port, use_ssl: true) { |h| h.request(req) }

    # An error response must not be mistaken for detail data.
    return {} unless res.is_a?(Net::HTTPSuccess)

    # res.body may be nil; JSON.parse(nil) would raise TypeError, which is
    # not covered by the rescue below.
    raw = res.body.to_s
    return {} if raw.empty?

    # Only decompress when the header is still present on the response.
    body =
      if res["Content-Encoding"] == "gzip"
        Zlib::GzipReader.new(StringIO.new(raw)).read
      else
        raw
      end

    parsed = JSON.parse(body)
    parsed.is_a?(Hash) ? parsed : {}
  rescue JSON::ParserError, Zlib::Error
    {}
  end
  # Downloads every attachment of one advertisement and returns the list of
  # file paths that were written.
  #
  # Files are stored under DOWNLOAD_DIR/<council_name>/<ref>/, where any
  # character in ref outside [0-9a-zA-Z_-] is replaced by "_". A download that
  # raises or answers with a non-2xx status is logged and skipped so the
  # remaining attachments are still attempted.
  #
  # ref          - council reference string of the advertisement.
  # uuid         - advertisement identifier used in the attachment URL.
  # attachments  - collection of Hashes read via "id" and "documentTitle".
  # council_name - sub-directory name for the council.
  # jar          - Hash of cookies sent via cookie_header.
  def download_attachments(ref, uuid, attachments, council_name, jar)
    dir = File.join(DOWNLOAD_DIR, council_name, ref.gsub(/[^0-9a-zA-Z_-]/, "_"))
    FileUtils.mkdir_p(dir)
    saved_paths = []

    attachments.each do |att|
      begin
        att_id = att["id"]
        raw_title = att["documentTitle"].to_s
        # An untitled attachment would otherwise be written as ".pdf".
        raw_title = "attachment_#{att_id}" if raw_title.empty?
        title = raw_title.gsub(/[^\w\-.]+/, "_")

        att_uri = URI("#{BASE}/external/advertisement/#{uuid}/attachment/#{att_id}")
        att_req = Net::HTTP::Get.new(att_uri)
        att_req["Cookie"] = cookie_header(jar)
        att_req["Referer"] = "#{BASE}/external/advertisement/#{uuid}"
        att_res = Net::HTTP.start(att_uri.host, att_uri.port, use_ssl: true) { |h| h.request(att_req) }

        # Never store an error page under a .pdf name.
        unless att_res.is_a?(Net::HTTPSuccess)
          Log.warn "planbuild", "Attachment download failed for #{ref} att #{att_id}: HTTP #{att_res.code}"
          next
        end

        path = File.join(dir, "#{title}.pdf")
        File.binwrite(path, att_res.body)
        saved_paths << path
      rescue StandardError => e
        Log.warn "planbuild", "Attachment download failed for #{ref} att #{att["id"]}: #{e.class} #{e.message}"
      end
    end

    saved_paths
  end

  # --- main: fetch the list, then upsert and enrich every advertisement ---
  puts "Fetching PlanBuild list…"
  items, jar, token, hdr = fetch_list
  # Tolerate a list response without items instead of crashing on nil.length.
  items ||= []
  puts "Found #{items.length} items for #{TABLE}"

  items.each do |r|
    ref = r["referenceNumber"]
    addr = r["addressString"]
    # Records without a reference or an address cannot be stored.
    next if ref.to_s.strip.empty? || addr.to_s.strip.empty?

    begin
      desc = r["description"]
      uuid = r["uuid"]
      # Dates are parsed inside the rescue so one bad record is skipped
      # rather than aborting the whole run.
      start = Util.parse_epoch_ms(r["startDate"])
      fin = Util.parse_epoch_ms(r["endDate"])

      # derive council table from reference number (e.g. PLN-HOB-xxxx)
      table = Util.ref_to_table(ref)
      council_name = Util.ref_to_folder(ref).downcase
      DB.ensure_table!(table)

      # fetch detail; a failure here only loses the optional detail fields
      detail = {}
      begin
        detail = fetch_detail(uuid, jar, token, hdr) if uuid
      rescue StandardError => e
        Log.warn "planbuild", "Detail fetch failed for #{ref}: #{e.class} #{e.message}"
      end
      Log.debug "planbuild", "#{ref} -> #{table}, detail keys: #{detail.keys.join(", ")}"

      # handle attachments
      attachments = detail["attachments"]
      saved_paths =
        if DOWNLOAD_ATTACHMENTS && uuid && attachments&.any?
          download_attachments(ref, uuid, attachments, council_name, jar)
        else
          []
        end
      # Only the first saved file is exposed, with DOWNLOAD_DIR replaced by /files.
      local_url = saved_paths.empty? ? nil : saved_paths.first.sub(DOWNLOAD_DIR, "/files")

      # upsert
      DB.upsert(table, {
        description: desc,
        date_received: start,
        date_received_raw: start&.strftime("%Y-%m-%d"),
        on_notice_to: fin,
        on_notice_to_raw: fin&.strftime("%Y-%m-%d"),
        address: addr[0, 255],
        council_reference: ref[0, 100],
        applicant: detail["applicant"],
        owner: detail["owner"],
        local_document_url: local_url
      })
      enrich_after_upsert!(
        table: table,
        council_reference: ref,
        address: addr
      )
      Log.info "planbuild", "Upserted #{ref} -> #{addr} into #{table} (PDFs: #{saved_paths.length})"
    rescue StandardError => e
      Log.warn "planbuild", "Skipping #{ref}: #{e.class} #{e.message}"
    end
  end

  puts "Done #{TABLE}."