# northernmidlands.rb
# Northern Midlands Council — Advertised Planning Applications
#
# Source: https://northernmidlands.tas.gov.au/planning/development-in-the-northern-midlands/development-applications-2
#
# Page structure:
#   <h2>Closing 17 April 2026</h2>
#   <p>
#     <a href="...pdf"><strong>PLN-26-0030 - 13 Murray Street, Evandale:</strong></a>
#     (CT 21/1332) - Subdivision (2 Lot)
#   </p>
  11. require "nokogiri"
  12. require "uri"
  13. require "fileutils"
  14. require_relative "../lib/scraper_helpers"
  15. require_relative "../lib/util"
  16. require_relative "../lib/log"
  17. TABLE = ENV.fetch("TABLE_NAME")
  18. URL = "https://northernmidlands.tas.gov.au/planning/development-in-the-northern-midlands/development-applications-2"
  19. DOWNLOAD_ATTACHMENTS = ENV["DOWNLOAD_ATTACHMENTS"] == "1"
  20. DOWNLOAD_DIR = ENV["DOWNLOAD_DIR"] || "/app/downloads"
  21. DB.ensure_table!(TABLE)
  22. def safe_name(s) = s.to_s.gsub(/[^\w\-.]+/, "_")
  23. def download_pdf(url, council_reference)
  24. return nil if url.to_s.strip.empty?
  25. dir = File.join(DOWNLOAD_DIR, "northernmidlands", safe_name(council_reference))
  26. FileUtils.mkdir_p(dir)
  27. fname = safe_name(File.basename(URI.parse(url).path))
  28. fname = "document.pdf" if fname.empty?
  29. path = File.join(dir, fname)
  30. body = Http.get(url)
  31. File.binwrite(path, body)
  32. puts " saved #{fname} (#{body.bytesize} bytes)"
  33. "/downloads/northernmidlands/#{safe_name(council_reference)}/#{fname}"
  34. rescue StandardError => e
  35. Log.warn "northernmidlands", "Download failed for #{url}: #{e.class} #{e.message}"
  36. nil
  37. end
  38. REF_RX = /\bPLN-\d{2}-\d{4}\b/i
  39. html = Http.get(URL)
  40. doc = Nokogiri::HTML(html)
  41. items = []
  42. closing_date = nil
  43. closing_date_raw = ""
  44. # Walk nodes in document order so h2 headings set the closing date for
  45. # the <p> entries that follow them.
  46. doc.css("h2, p").each do |node|
  47. if node.name == "h2"
  48. text = node.text.gsub(/\u00a0|\s+/, " ").strip
  49. if (m = text.match(/Closing\s+(.+)/i))
  50. closing_date_raw = m[1].strip
  51. closing_date = Util.parse_aus_date(closing_date_raw)
  52. end
  53. next
  54. end
  55. # <p> — look for a PLN reference inside a link
  56. link = node.at_css("a[href]")
  57. next unless link
  58. strong = node.at_css("strong")
  59. label = (strong || link).text.gsub(/\u00a0|\s+/, " ").strip
  60. # e.g. "PLN-26-0030 - 13 Murray Street, Evandale:"
  61. ref = label.match(REF_RX)&.[](0)
  62. next unless ref
  63. # Address: everything after "PLN-XX-XXXX - " with trailing colon stripped
  64. address = label.sub(/\APLN-\d{2}-\d{4}\s*-\s*/i, "").sub(/:?\s*\z/, "").strip
  65. next if address.empty?
  66. # Remainder of the <p> text (outside the link/strong) gives description + CT
  67. remainder = node.text.sub(label, "").gsub(/\u00a0|\s+/, " ").strip
  68. # e.g. "(CT 189429/1) - Multiple Dwelling (1 existing 1 new manager's residence)"
  69. title_reference = remainder.match(/CT\s+([\d\/]+)/i)&.[](1).to_s
  70. description = remainder.sub(/\A\s*\(CT[^)]*\)\s*-?\s*/i, "").strip
  71. description = "Development Application" if description.empty?
  72. document_url = abs_url(URL, link["href"].to_s)
  73. items << {
  74. council_reference: ref,
  75. address: address,
  76. description: description,
  77. on_notice_to: closing_date,
  78. on_notice_to_raw: closing_date_raw,
  79. title_reference: title_reference,
  80. document_url: document_url
  81. }
  82. end
  83. puts "Found #{items.length} item(s) for #{TABLE}"
  84. items.each do |r|
  85. local_url = DOWNLOAD_ATTACHMENTS ? download_pdf(r[:document_url], r[:council_reference]) : nil
  86. upsert_and_enrich!(
  87. table: TABLE,
  88. row: {
  89. council_reference: r[:council_reference],
  90. address: r[:address],
  91. description: r[:description],
  92. on_notice_to: r[:on_notice_to],
  93. on_notice_to_raw: r[:on_notice_to_raw],
  94. title_reference: r[:title_reference],
  95. applicant: "",
  96. owner: ""
  97. },
  98. extras: {
  99. document_url: r[:document_url],
  100. local_document_url: local_url
  101. }
  102. )
  103. end
  104. puts "Done #{TABLE}. Saved #{items.length} item(s)."