huonvalley.rb 4.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132
  1. # Huon Valley Council — Advertised Applications (site page, not PlanBuild)
  2. # Source: https://www.huonvalley.tas.gov.au/development/planning/advertised-applications/
  3. #
  4. # Page structure per application:
  5. # <div class="accordion-grid-item">
  6. # <h2 class="accordion-grid-item__title">DA-37/2026</h2>
  7. # <div class="accordion-grid-item__description">Description, Address (CT-ref)</div>
  8. # <a class="plan-file-list__item" href="sharepoint...">Copy of application for viewing</a>
  9. # </div>
  10. require "nokogiri"
  11. require "uri"
  12. require "cgi"
  13. require_relative "../lib/http"
  14. require_relative "../lib/db"
  15. require_relative "../lib/util"
  16. require_relative "../lib/enrich"
  17. require_relative "../lib/log"
# Destination table name. ENV.fetch has no default here, so the script raises
# KeyError at load time when TABLE_NAME is not set.
TABLE = ENV.fetch("TABLE_NAME") # run_all.sh -> da_huonvalley
# First (and possibly only) listing page; the crawl starts here.
START_URL = "https://www.huonvalley.tas.gov.au/development/planning/advertised-applications/"
# NOTE(review): presumably creates the table when it does not exist yet —
# confirm against lib/db.
DB.ensure_table!(TABLE)
# DA-37/2026 or DA 37/2026 (number/year order)
# Case-insensitive: "DA", an optional hyphen or whitespace character, a
# 1-4 digit number, "/", then a four-digit year starting with 20.
REF_RX = /\bDA[-\s]?\d{1,4}\/20\d{2}\b/i
  23. def abs_url(base, href)
  24. h = href.to_s.strip
  25. return nil if h.empty?
  26. return h if h.start_with?("http://", "https://")
  27. URI.join(base, h).to_s
  28. rescue URI::InvalidURIError
  29. h
  30. end
  31. def parse_page(html, base_url)
  32. doc = Nokogiri::HTML(html)
  33. rows = []
  34. doc.css("div.accordion-grid-item").each do |item|
  35. ref = item.at_css("h2.accordion-grid-item__title")&.text&.strip
  36. desc_addr = item.at_css("div.accordion-grid-item__description")&.text&.strip&.gsub(/\s+/, " ")
  37. doc_link = item.at_css("a.plan-file-list__item")&.[]("href")
  38. next if ref.nil? || !ref.match?(REF_RX)
  39. next if desc_addr.nil? || desc_addr.empty?
  40. document_url = abs_url(base_url, doc_link)
  41. # Split "Dwelling, outbuilding..., 100 Turners Road, Cradoc (CT-237651/1)"
  42. # into description + address at the first ", <digits> " pattern
  43. description, address = if (m = desc_addr.match(/\A(.+?),\s*(\d+\s+\S.+)\z/m))
  44. [m[1].strip, m[2].strip]
  45. else
  46. ["Development Application", desc_addr]
  47. end
  48. # Strip cadastral reference from end of address: "(CT-237651/1)"
  49. address = address.sub(/\s*\(CT-[\d\/]+\)\s*\z/, "").strip
  50. next if address.empty?
  51. rows << {
  52. council_reference: ref,
  53. address: address[0, 255],
  54. description: description,
  55. date_received_raw: "",
  56. date_received: nil,
  57. document_url: document_url
  58. }
  59. end
  60. # Pagination: find a "Next" link
  61. next_href = nil
  62. if (next_a = doc.css("a").find { |a| a.text.strip.downcase == "next" })
  63. next_href = abs_url(base_url, next_a["href"])
  64. end
  65. [rows, next_href]
  66. end
  67. saved = 0
  68. url = START_URL
  69. seen = {}
  70. loop do
  71. html = begin
  72. Http.get(url)
  73. rescue StandardError => e
  74. Log.warn "huonvalley", "Failed to fetch #{url}: #{e.class} #{e.message}"
  75. break
  76. end
  77. rows, next_url = parse_page(html, url)
  78. puts "Found #{rows.length} item(s) on #{url}"
  79. rows.each do |r|
  80. key = [r[:council_reference], r[:address]]
  81. next if seen[key]
  82. seen[key] = true
  83. begin
  84. DB.upsert(TABLE, {
  85. description: r[:description],
  86. date_received: r[:date_received],
  87. date_received_raw: r[:date_received_raw],
  88. address: r[:address],
  89. council_reference: r[:council_reference],
  90. document_url: r[:document_url],
  91. applicant: "",
  92. owner: ""
  93. })
  94. enrich_after_upsert!(
  95. table: TABLE,
  96. council_reference: r[:council_reference],
  97. address: r[:address]
  98. )
  99. Log.info "huonvalley", "Upserted #{r[:council_reference]} -> #{r[:address]}"
  100. saved += 1
  101. rescue StandardError => e
  102. Log.warn "huonvalley", "DB error for #{r[:council_reference]}: #{e.class} #{e.message}"
  103. end
  104. end
  105. break if next_url.nil? || next_url == url
  106. url = next_url
  107. end
  108. puts "Done #{TABLE}. Saved #{saved} item(s)."