huonvalley.rb 4.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148
  1. # Huon Valley Council — Advertised Applications (site page, not PlanBuild)
  2. # Source: https://www.huonvalley.tas.gov.au/development/planning/advertised-applications/
  3. #
  4. # Page structure per application (flat siblings, no wrapper div):
  5. # <h2>DA-37/2026</h2>
  6. # <p>Description, Address (CT-land-title-ref)</p>
  7. # <h3>More Information</h3>
  8. # <a href="mapbox...">...</a>
  9. # <h3>Available Documents:</h3>
  10. # <a href="sharepoint...">Copy of application for viewing</a>
  11. require "nokogiri"
  12. require "uri"
  13. require "cgi"
  14. require_relative "../lib/http"
  15. require_relative "../lib/db"
  16. require_relative "../lib/util"
  17. require_relative "../lib/enrich"
  18. require_relative "../lib/log"
# Destination table name, taken from the TABLE_NAME environment variable.
# ENV.fetch raises KeyError when the variable is unset, so the script stops
# here instead of running without a target table.
TABLE = ENV.fetch("TABLE_NAME") # run_all.sh -> da_huonvalley
# Page the crawl starts from; later pages are reached through "Next" links.
START_URL = "https://www.huonvalley.tas.gov.au/development/planning/advertised-applications/"
# Executed at load time, before any page is fetched.
# NOTE(review): presumably creates the table when it is missing; the actual
# behaviour lives in lib/db - confirm there.
DB.ensure_table!(TABLE)
# Council reference pattern: "DA", an optional hyphen or whitespace character,
# a 1-4 digit number, "/", then a year 20xx (e.g. DA-37/2026, DA 37/2026).
# Case-insensitive and unanchored, so it matches anywhere inside the text.
REF_RX = /\bDA[-\s]?\d{1,4}\/20\d{2}\b/i
  24. def abs_url(base, href)
  25. return nil if href.to_s.strip.empty?
  26. URI.join(base, href).to_s
  27. rescue URI::InvalidURIError
  28. nil
  29. end
  30. def parse_page(html, base_url)
  31. doc = Nokogiri::HTML(html)
  32. rows = []
  33. # Drive from each plain <h2> whose text matches the DA ref pattern
  34. doc.css("h2").each do |h2|
  35. ref = h2.text.strip
  36. next unless ref.match?(REF_RX)
  37. desc_addr = nil
  38. document_url = nil
  39. sib = h2.next_element
  40. 15.times do
  41. break if sib.nil?
  42. # First <p> after the heading holds description + address
  43. if sib.name == "p" && desc_addr.nil?
  44. desc_addr = sib.text.strip.gsub(/\s+/, " ")
  45. end
  46. # Document link follows <h3>Available Documents:</h3>
  47. if sib.name == "a" && sib.text.strip.match?(/copy of application for viewing/i)
  48. document_url = abs_url(base_url, sib["href"])
  49. break
  50. end
  51. # Stop at the next application's <h2>
  52. break if sib.name == "h2" && sib.text.strip.match?(REF_RX)
  53. sib = sib.next_element
  54. end
  55. next if desc_addr.nil? || desc_addr.empty?
  56. # Split "Dwelling, outbuilding..., 100 Turners Road, Cradoc (CT-237651/1)"
  57. # into description and address at the first ", <number> " pattern
  58. description, address = if (m = desc_addr.match(/\A(.+?),\s*(\d+\s+\S.+)\z/m))
  59. [m[1].strip, m[2].strip]
  60. else
  61. ["Development Application", desc_addr]
  62. end
  63. # Strip cadastral reference from end of address: "(CT-237651/1)"
  64. address = address.sub(/\s*\(CT-[\d\/]+\)\s*\z/, "").strip
  65. next if address.empty?
  66. rows << {
  67. council_reference: ref,
  68. address: address[0, 255],
  69. description: description,
  70. date_received_raw: "",
  71. date_received: nil,
  72. document_url: document_url
  73. }
  74. end
  75. # Pagination: find a "Next" link
  76. next_href = nil
  77. if (next_a = doc.css("a").find { |a| a.text.strip.downcase == "next" })
  78. next_href = abs_url(base_url, next_a["href"])
  79. end
  80. [rows, next_href]
  81. end
  82. saved = 0
  83. url = START_URL
  84. seen = {}
  85. loop do
  86. html = begin
  87. Http.get(url)
  88. rescue StandardError => e
  89. Log.warn "huonvalley", "Failed to fetch #{url}: #{e.class} #{e.message}"
  90. break
  91. end
  92. rows, next_url = parse_page(html, url)
  93. puts "Found #{rows.length} item(s) on #{url}"
  94. rows.each do |r|
  95. key = [r[:council_reference], r[:address]]
  96. next if seen[key]
  97. seen[key] = true
  98. begin
  99. DB.upsert(TABLE, {
  100. description: r[:description],
  101. date_received: r[:date_received],
  102. date_received_raw: r[:date_received_raw],
  103. address: r[:address],
  104. council_reference: r[:council_reference],
  105. document_url: r[:document_url],
  106. applicant: "",
  107. owner: ""
  108. })
  109. enrich_after_upsert!(
  110. table: TABLE,
  111. council_reference: r[:council_reference],
  112. address: r[:address]
  113. )
  114. Log.info "huonvalley", "Upserted #{r[:council_reference]} -> #{r[:address]}"
  115. saved += 1
  116. rescue StandardError => e
  117. Log.warn "huonvalley", "DB error for #{r[:council_reference]}: #{e.class} #{e.message}"
  118. end
  119. end
  120. break if next_url.nil? || next_url == url
  121. url = next_url
  122. end
  123. puts "Done #{TABLE}. Saved #{saved} item(s)."