planbuild.rb 5.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190
  1. # scrapers/planbuild.rb
  2. require "date"
  3. require "json"
  4. require "nokogiri"
  5. require "uri"
  6. require "net/http"
  7. require "open-uri"
  8. require_relative "../lib/http"
  9. require_relative "../lib/db"
  10. require_relative "../lib/util"
  11. require_relative "../lib/geocode"
  12. require_relative "../lib/enrich"
  13. require "fileutils"
  14. TABLE = ENV.fetch("TABLE_NAME")
  15. BASE = "https://portal.planbuild.tas.gov.au"
  16. PAGE = "#{BASE}/external/advertisement/search"
  17. DOWNLOAD_ATTACHMENTS = ENV["DOWNLOAD_ATTACHMENTS"] == "1"
  18. DOWNLOAD_DIR = ENV["DOWNLOAD_DIR"] || "/app/downloads"
  19. DB.ensure_table!(TABLE)
  20. # --- cookie + csrf helpers ---
  21. def merge_set_cookie!(jar, res)
  22. (res.get_fields("set-cookie") || []).each do |raw|
  23. raw.split(/,(?=[^;]+?=)/).each do |c|
  24. if c =~ /\A\s*([^=;,\s]+)\s*=\s*([^;,\s]+)/
  25. jar[$1] = $2
  26. end
  27. end
  28. end
  29. end
  30. def cookie_header(jar)
  31. base = "accepted=1; disclaimerAccepted=true; insecureSiteWideBanner=1"
  32. more = jar.map { |k, v| "#{k}=#{v}" }.join("; ")
  33. [base, more].reject(&:empty?).join("; ")
  34. end
  35. # --- fetch list of advertisements ---
  36. def fetch_list
  37. jar = {}
  38. # 1) GET page to grab CSRF + SESSION
  39. res = Http.request(URI(PAGE), headers: { "Referer" => BASE }, jar: jar)
  40. merge_set_cookie!(jar, res)
  41. doc = Nokogiri::HTML(res.body)
  42. token = doc.at(%{meta[name="_csrf"]})&.[]("content")
  43. hdr = doc.at(%{meta[name="_csrf_header"]})&.[]("content") || "X-CSRF-TOKEN"
  44. raise "no CSRF token" unless token
  45. raise "no SESSION cookie" unless jar["SESSION"]
  46. # 2) POST listadvertisements
  47. uri = URI("#{BASE}/external/advertisement/search/listadvertisements")
  48. req = Net::HTTP::Post.new(uri)
  49. req["Content-Type"] = "application/json"
  50. req["X-Requested-With"] = "XMLHttpRequest"
  51. req["Origin"] = BASE
  52. req["Referer"] = PAGE
  53. req[hdr] = token
  54. req["Cookie"] = cookie_header(jar)
  55. req.body = { lgas: [] }.to_json
  56. res = Net::HTTP.start(uri.host, uri.port, use_ssl: true) { |h| h.request(req) }
  57. js = JSON.parse(res.body)
  58. items = js.is_a?(Array) ? js : js["items"]
  59. [items, jar, token, hdr]
  60. end
  61. # --- fetch details ---
  62. def fetch_detail(uuid, jar, token, hdr)
  63. uri = URI("#{BASE}/external/advertisement/#{uuid}/get")
  64. req = Net::HTTP::Get.new(uri)
  65. req["X-Requested-With"] = "XMLHttpRequest"
  66. req["Referer"] = PAGE
  67. req[hdr] = token
  68. req["Cookie"] = cookie_header(jar)
  69. res = Net::HTTP.start(uri.host, uri.port, use_ssl: true) { |h| h.request(req) }
  70. # decompress if gzip
  71. body = res['Content-Encoding'] == 'gzip' ? Zlib::GzipReader.new(StringIO.new(res.body)).read : res.body
  72. JSON.parse(body) rescue {}
  73. end
  74. puts "Fetching PlanBuild list…"
  75. items, jar, token, hdr = fetch_list
  76. puts "Found #{items.length} items for #{TABLE}"
  77. items.each do |r|
  78. ref = r["referenceNumber"]
  79. addr = r["addressString"]
  80. desc = r["description"]
  81. start = Util.parse_epoch_ms(r["startDate"])
  82. fin = Util.parse_epoch_ms(r["endDate"])
  83. uuid = r["uuid"]
  84. next if ref.to_s.strip.empty? || addr.to_s.strip.empty?
  85. # derive council code & table
  86. council_code = ref.split("-")[1].to_s.upcase # e.g. PLN-HOB-xxxx → HOB
  87. table = Util.ref_to_table(ref)
  88. council_name = Util.ref_to_folder(ref).downcase # use for file path
  89. DB.ensure_table!(table)
  90. # --- fetch detail page ---
  91. detail = {}
  92. begin
  93. detail = fetch_detail(uuid, jar, token, hdr) if uuid
  94. rescue => e
  95. warn "Detail fetch failed for #{ref}: #{e.class} #{e.message}"
  96. end
  97. puts "Council: #{table}"
  98. puts "DETAIL for #{ref}: keys=#{detail.keys}"
  99. if detail["attachments"]&.any?
  100. puts "Attachments: #{detail['attachments'].map { |a| "id=#{a['id']}, title=#{a['documentTitle']}" }}"
  101. else
  102. puts "Attachments: none"
  103. end
  104. # --- handle attachments ---
  105. # --- handle attachments ---
  106. saved_paths = []
  107. if DOWNLOAD_ATTACHMENTS && uuid && detail["attachments"]&.any?
  108. dir = File.join(DOWNLOAD_DIR, council_name, ref.gsub(/[^0-9a-zA-Z_-]/, "_"))
  109. FileUtils.mkdir_p(dir)
  110. (detail["attachments"] || []).each do |att|
  111. att_id = att["id"]
  112. title = att["documentTitle"]
  113. pdf_url = "#{BASE}/external/advertisement/#{uuid}/attachment/#{att_id}"
  114. path = File.join(dir, "#{title.gsub(/[^\w\-.]+/, '_')}.pdf")
  115. uri = URI(pdf_url)
  116. req = Net::HTTP::Get.new(uri)
  117. req["Cookie"] = cookie_header(jar)
  118. req["Referer"] = "#{BASE}/external/advertisement/#{uuid}"
  119. res = Net::HTTP.start(uri.host, uri.port, use_ssl: true) { |h| h.request(req) }
  120. File.binwrite(path, res.body)
  121. saved_paths << path
  122. end
  123. # store first PDF relative path in DB
  124. if saved_paths.any?
  125. first_web_rel = saved_paths.first.sub(DOWNLOAD_DIR, "/files")
  126. DB.client.prepare("UPDATE `#{table}` SET local_document_url = ? WHERE council_reference = ?")
  127. .execute(first_web_rel, ref)
  128. end
  129. end
  130. # geocode
  131. geo = nil
  132. begin
  133. geo = Geocode.format_au(addr)
  134. rescue => e
  135. warn "Geocode error for #{ref}: #{e.class} #{e.message}"
  136. end
  137. # --- upsert into DB ---
  138. DB.upsert(table, {
  139. description: desc,
  140. date_received: start,
  141. date_received_raw: start&.strftime("%Y-%m-%d"),
  142. on_notice_to: fin,
  143. on_notice_to_raw: fin&.strftime("%Y-%m-%d"),
  144. address: addr[0,255],
  145. council_reference: ref[0,100],
  146. applicant: detail["applicant"],
  147. owner: detail["owner"],
  148. local_document_url: saved_paths.join(", ")
  149. })
  150. enrich_after_upsert!(
  151. table: table,
  152. council_reference: ref,
  153. address: addr
  154. )
  155. puts "Upserted #{ref} -> #{addr} into #{table}, PDFs: #{saved_paths.length}"
  156. end
  157. puts "Done #{TABLE}."