northernmidlands.rb 3.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102
  1. # Northern Midlands Council — Advertised Planning Applications
  2. #
  3. # Source: https://northernmidlands.tas.gov.au/planning/development-in-the-northern-midlands/development-applications-2
  4. #
  5. # Page structure:
  6. # <h2>Closing 17 April 2026</h2>
  7. # <p>
  8. # <a href="...pdf"><strong>PLN-26-0030 - 13 Murray Street, Evandale:</strong></a>
  9. # (CT 21/1332) - Subdivision (2 Lot)
  10. # </p>
  11. require "nokogiri"
  12. require "uri"
  13. require_relative "../lib/scraper_helpers"
  14. require_relative "../lib/util"
  15. require_relative "../lib/log"
  # --- Configuration -------------------------------------------------------
  # Destination table name comes from the environment; ENV.fetch raises
  # KeyError when TABLE_NAME is not set, so the script stops before any work.
  TABLE = ENV.fetch("TABLE_NAME")
  # Council page listing the currently advertised planning applications.
  URL = "https://northernmidlands.tas.gov.au/planning/development-in-the-northern-midlands/development-applications-2"
  # Council reference as it appears in link labels, e.g. "PLN-26-0030".
  REF_RX = /\bPLN-\d{2}-\d{4}\b/i

  # Run the DB helper's table check for TABLE before touching the network.
  DB.ensure_table!(TABLE)

  # Download the listing page and parse it into a Nokogiri document.
  doc = Nokogiri::HTML(Http.get(URL))

  # Rows collected by the parse loop, plus the closing date (parsed and raw
  # text) carried forward from the most recent "Closing ..." heading.
  items = []
  closing_date, closing_date_raw = nil, ""
  # Walk <h2> and <p> nodes in document order. A "Closing <date>" heading sets
  # the closing date carried by every application paragraph that follows it;
  # each paragraph holding a link whose label contains a PLN reference becomes
  # one entry in `items`.

  # Collapses every run of NBSP/ASCII whitespace into a single space and trims
  # the ends. The same normalisation is applied to headings, link labels and
  # whole paragraphs so the label can always be found inside the paragraph
  # text, whatever line breaks or NBSPs the page markup contains.
  squish = ->(text) { text.gsub(/[\u00a0\s]+/, " ").strip }

  doc.css("h2, p").each do |node|
    if node.name == "h2"
      heading = squish.call(node.text)
      if (m = heading.match(/Closing\s+(.+)/i))
        closing_date_raw = m[1].strip
        closing_date = Util.parse_aus_date(closing_date_raw)
      end
      # Headings never describe an application themselves.
      next
    end

    # <p> — only paragraphs containing a link can be application entries.
    link = node.at_css("a[href]")
    next unless link

    # Prefer the bold text as the label; fall back to the link text.
    strong = node.at_css("strong")
    label = squish.call((strong || link).text)

    # e.g. "PLN-26-0030 - 13 Murray Street, Evandale:"
    ref = label.match(REF_RX)&.[](0)
    next unless ref

    # Address: everything after "PLN-XX-XXXX - " with the trailing colon stripped.
    address = label.sub(/\APLN-\d{2}-\d{4}\s*-\s*/i, "").sub(/:?\s*\z/, "").strip
    next if address.empty?

    # Remainder of the paragraph (outside the label) gives description + CT.
    # The paragraph is normalised BEFORE the label is removed: the label is
    # already normalised, so removing it from the raw text would silently fail
    # whenever the markup had an NBSP or a line break inside the label, and the
    # description would then swallow the whole paragraph.
    remainder = squish.call(node.text).sub(label, "").strip

    # e.g. "(CT 189429/1) - Multiple Dwelling (1 existing 1 new manager's residence)"
    # `&.[](1)` yields nil when no CT is present; `.to_s` turns that into "".
    title_reference = remainder.match(/CT\s+([\d\/]+)/i)&.[](1).to_s
    description = remainder.sub(/\A\s*\(CT[^)]*\)\s*-?\s*/i, "").strip
    description = "Development Application" if description.empty?

    document_url = abs_url(URL, link["href"].to_s)

    items << {
      council_reference: ref,
      address: address,
      description: description,
      on_notice_to: closing_date,
      on_notice_to_raw: closing_date_raw,
      title_reference: title_reference,
      document_url: document_url
    }
  end
  # Report how many entries were scraped, then store each one.
  puts "Found #{items.length} item(s) for #{TABLE}"

  items.each do |item|
    # Core columns are copied straight from the scraped entry; the listing
    # page supplies no applicant or owner, so those are written as blanks.
    row = item
          .slice(:council_reference, :address, :description,
                 :on_notice_to, :on_notice_to_raw, :title_reference)
          .merge(applicant: "", owner: "")

    # The PDF link is passed separately from the row, as `extras`.
    upsert_and_enrich!(
      table: TABLE,
      row: row,
      extras: item.slice(:document_url)
    )
  end

  puts "Done #{TABLE}. Saved #{items.length} item(s)."