clarence.rb 6.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201
  1. # Clarence City Council — Advertised Plans
  2. # Source list: https://www.ccc.tas.gov.au/development/advertised-plans/
  3. require "nokogiri"
  4. require "cgi"
  5. require "uri"
  6. require "date"
  7. require "fileutils"
  8. require_relative "../lib/http"
  9. require_relative "../lib/db"
  10. require_relative "../lib/util"
  11. require_relative "../lib/geocode"
  12. require_relative "../lib/enrich"
  # Runtime configuration, read once from the environment at load time.
  # TABLE_NAME is mandatory (ENV.fetch raises KeyError when it is missing).
  TABLE = ENV.fetch("TABLE_NAME") # run_all.sh -> da_clarence
  URL = "https://www.ccc.tas.gov.au/development/advertised-plans/"
  # Attachments are only downloaded when the flag is exactly "1".
  DOWNLOAD_ATTACHMENTS = ENV.fetch("DOWNLOAD_ATTACHMENTS", nil) == "1"
  DOWNLOAD_DIR = ENV.fetch("DOWNLOAD_DIR", "/app/downloads")

  DB.ensure_table!(TABLE)
  # Resolve a possibly-relative href against the URL of the page it came from.
  #
  # The href is trimmed first: attribute values copied out of HTML often carry
  # stray whitespace, which URI.join rejects.
  #
  # @param base [String] URL the href is relative to
  # @param href [String, nil] raw href attribute value
  # @return [String] the joined URL; "" when href is blank; the trimmed href
  #   itself when it cannot be parsed or joined
  def abs_url(base, href)
    link = href.to_s.strip
    return "" if link.empty?

    URI.join(base, link).to_s
  rescue URI::Error
    # URI::Error is the parent of both InvalidURIError (malformed input) and
    # BadURIError (e.g. joining two relative URIs), so neither can escape.
    link
  end
  # Pull the value that follows "Application Number:" out of a text blob.
  #
  # @param text [String, nil] free text scraped from the listing
  # @return [String] the reference token, or "" when the label is absent
  def extract_app_number(text)
    found = text.to_s.match(/Application\s*Number:\s*([A-Za-z0-9\/\-\._]+)/i)
    return "" unless found

    found[1].strip
  end
  # Pull the raw text that follows "Closes:" up to the end of the line (or the
  # next "<").
  #
  # @param text [String, nil] free text scraped from the listing
  # @return [String] the trimmed remainder of the line, or "" when absent
  def extract_close_raw(text)
    hit = /Closes:\s*([^\n\r<]+)/i.match(text.to_s)
    return "" unless hit

    hit[1].strip
  end
  # Find the first date-looking token in a string.
  #
  # Shapes are tried in priority order: d/m/y, "14 March 2025", then
  # "March 14, 2025". The first shape that matches anywhere wins.
  #
  # @param s [String, nil] text that may contain a date
  # @return [String] the matched token, or "" when no shape matches
  def parse_date_token(s)
    haystack = s.to_s
    date_shapes = [
      /(\b\d{1,2}\/\d{1,2}\/\d{2,4}\b)/,
      /(\b\d{1,2}\s+[A-Za-z]{3,}\s+\d{4}\b)/,
      /(\b[A-Za-z]{3,}\s+\d{1,2},?\s+\d{4}\b)/
    ]

    date_shapes.each do |shape|
      token = haystack[shape, 1]
      return token if token
    end

    ""
  end
  # Heuristic: does the string contain "<number> <word>" and also a comma
  # followed by a run of at least two capital letters?
  #
  # @param s [String, nil] candidate title part
  # @return [Integer, nil] index of the comma match when both patterns are
  #   present (truthy), nil otherwise
  def looks_like_address(s)
    return nil unless s =~ /\d{1,4}\s+\S+/

    s =~ /,\s*[A-Z][A-Z]+/
  end
  # Break a heading into its parts.
  #
  # The separator is a whitespace-surrounded en dash; when that yields fewer
  # than two parts, a whitespace-surrounded hyphen is tried instead.
  #
  # @param title [String] heading text
  # @return [Array<String>] trimmed parts, in order
  def split_title(title)
    pieces = title.split(/\s+–\s+/) # en dash
    pieces = title.split(/\s+-\s+/) unless pieces.length >= 2
    pieces.map(&:strip)
  end
  # Choose the title part most likely to be the street address.
  #
  # Preference order: a part accepted by looks_like_address, then the first
  # part containing any digit, then the second part (as a string).
  # Note that the digit fallback accepts any part with a digit in it.
  #
  # @param parts [Array<String>] title parts from split_title
  # @return [String] the chosen part, or "" when nothing qualifies
  def pick_address_from_title(parts)
    strong = parts.find { |piece| looks_like_address(piece) }
    return strong if strong

    with_digit = parts.find { |piece| piece =~ /\d/ }
    return with_digit if with_digit

    parts[1].to_s
  end
  # Choose the first title part that is neither the code nor the address and
  # is longer than three characters.
  #
  # @param parts [Array<String>] title parts from split_title
  # @param code [String] part already taken as the reference code
  # @param address [String] part already taken as the address
  # @return [String] the description part, or "" when none qualifies
  def pick_description_from_title(parts, code, address)
    chosen = parts.find do |piece|
      next false if piece == code || piece == address

      piece.length > 3
    end
    chosen.to_s
  end
  # Make a string safe to use as a single path segment: every run of
  # characters other than word characters, "-" and "." becomes one "_".
  #
  # @param s [Object] value to sanitise (converted with to_s)
  # @return [String] sanitised name
  def safe_name(s)
    raw = s.to_s
    raw.gsub(/[^\w\-.]+/, "_")
  end
  # Download the PDF (if enabled) and return a web path like:
  # /downloads/clarence/<council_reference>/<filename.pdf>
  #
  # Best-effort: any failure (fetch, unparsable URL, filesystem) is logged via
  # Log.warn and reported as nil, so one bad attachment does not abort the run.
  #
  # @param url [String, nil] URL of the advertised document
  # @param council_reference [String] application number; sanitised with
  #   safe_name and used as the sub-folder name
  # @return [String, nil] web path of the saved file, or nil when downloads are
  #   disabled, the URL is blank, the response body is empty, or anything failed
  def download_pdf(url, council_reference)
    return nil unless DOWNLOAD_ATTACHMENTS && !url.to_s.strip.empty?

    ref_dir = safe_name(council_reference)
    folder = File.join(DOWNLOAD_DIR, "clarence", ref_dir)

    res =
      begin
        Http.get_response(url)
      rescue StandardError
        # Fall back to the plain getter when get_response is missing or fails.
        Http.get(url)
      end
    body = res.respond_to?(:body) ? res.body : res.to_s

    # An empty payload is not a document; do not leave a zero-byte file behind
    # or report it as saved.
    if body.to_s.empty?
      Log.warn "scraper", "PDF download failed for #{url}: empty response body"
      return nil
    end

    fname = safe_name(File.basename(URI.parse(url).path.to_s))
    # A URL without a path gives an empty basename, which would otherwise
    # become a hidden file named ".pdf".
    fname = "document" if fname.empty?
    fname += ".pdf" unless fname.downcase.end_with?(".pdf")

    # Create the folder only once there is something to store; it sits inside
    # the rescue scope so a filesystem error is logged instead of raised.
    FileUtils.mkdir_p(folder)
    path = File.join(folder, fname)
    File.binwrite(path, body)
    puts "Saved PDF #{path}"

    # Web-accessible path (served by your web container)
    "/downloads/clarence/#{ref_dir}/#{fname}"
  rescue StandardError => e
    Log.warn "scraper", "PDF download failed for #{url}: #{e.class} #{e.message}"
    nil
  end
  # Fetch the listing page and turn each heading section into an item hash.
  list_html = Http.get(URL)
  doc = Nokogiri::HTML(list_html)
  items = []

  # Headings tend to be h2/h3, followed by blocks that contain
  # “Closes:” and “Application Number:” and a PDF link.
  doc.css("h2, h3").each do |h|
    title = h.text.to_s.strip
    next if title.empty?

    texts = []
    pdf_url = ""
    node = h

    # Walk at most 12 following sibling elements, stopping at the next h2/h3
    # so one section never reads into the next.
    12.times do
      node = node.next_element
      break if node.nil? || node.name =~ /^h[23]$/i

      texts << node.text.to_s.strip

      # Inspect every link in the block, not only the first one: a document
      # link may follow some other link inside the same element.
      pdf_href = node.css("a[href]").map { |a| a["href"].to_s }.find do |href|
        href =~ /\.pdf($|\?)/i || href.include?("assets.ccc.tas.gov.au")
      end
      pdf_url = abs_url(URL, pdf_href) if pdf_href
    end

    detail_text = texts.join("\n")
    app_no_raw = extract_app_number(detail_text)
    closes_raw = extract_close_raw(detail_text)
    closes_tok = parse_date_token(closes_raw)
    on_notice = Util.parse_aus_date(closes_tok)

    parts = split_title(title)
    code = parts.first.to_s
    address = pick_address_from_title(parts).to_s
    desc = pick_description_from_title(parts, code, address)
    desc = "Development Application" if desc.strip.empty?

    # Prefer the explicit "Application Number:" value; otherwise fall back to
    # the first title part.
    council_reference = app_no_raw.empty? ? code : app_no_raw
    next if council_reference.strip.empty? || address.strip.empty?

    items << {
      council_reference: council_reference,
      address: address,
      description: desc,
      on_notice_raw: closes_tok,
      on_notice: on_notice,
      pdf: pdf_url,
      title_reference: title
    }
  end

  # Keep one item per (reference, address) pair.
  items.uniq! { |r| [r[:council_reference], r[:address]] }
  puts "Found #{items.length} item(s) for #{TABLE}"
  # Persist every parsed item: upsert the core row, enrich it, optionally
  # download the attachment, then fill in the document/notice columns.
  date_received = Date.today

  # The extras UPDATE is prepared lazily on first use and then reused for
  # every item, instead of preparing a fresh statement per row.
  # NOTE(review): DB.client looks like a Mysql2::Client (escape / prepare /
  # execute) — confirm; close is guarded with respond_to? in case it is not.
  upd = nil

  begin
    items.each do |r|
      cr = r[:council_reference].to_s
      addr = r[:address].to_s

      # Skip site promo / competitions that occasionally appear as a “heading”
      next if cr =~ /turn your two cents/i || r[:title_reference].to_s =~ /two cents/i

      # Skip if we didn’t get a sensible address
      next if addr.strip.empty? || addr == cr

      # Clarence app numbers look like PDPLANPMTD-2025/054004 etc
      next unless cr =~ /\APDPLAN[A-Z]*-\d{4}\/\d+\z/

      DB.upsert(TABLE, {
        description: r[:description],
        date_received: date_received,
        on_notice_to: r[:on_notice],
        on_notice_to_raw: r[:on_notice_raw],
        address: addr,
        council_reference: cr,
        applicant: "",
        owner: ""
      })

      enrich_after_upsert!(
        table: TABLE,
        council_reference: cr,
        address: addr
      )

      # Try to download and set local_document_url
      local_doc_url = download_pdf(r[:pdf], cr)

      begin
        # COALESCE keeps an existing local_document_url when this run did not
        # produce a new download (local_doc_url is nil).
        upd ||= DB.client.prepare(
          "UPDATE `#{DB.client.escape(TABLE)}` " \
          "SET document_url = ?, " \
          " local_document_url = COALESCE(?, local_document_url), " \
          " on_notice_to = ?, on_notice_to_raw = ?, title_reference = ? " \
          "WHERE council_reference = ? AND address = ?"
        )
        upd.execute(r[:pdf], local_doc_url, r[:on_notice], r[:on_notice_raw], r[:title_reference], cr, addr)
      rescue StandardError => e
        # Best-effort: the core row is already upserted, so only warn.
        Log.warn "scraper", "Extras update skipped for #{cr}: #{e.class} #{e.message}"
      end

      puts "Upserted #{cr} -> #{addr} saved: #{local_doc_url ? 1 : 0}"
    end
  ensure
    # Release the prepared statement once the run is over.
    upd.close if upd.respond_to?(:close)
  end

  puts "Done #{TABLE}."