huonvalley.rb 4.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148
  1. # Huon Valley Council — Advertised Applications (site page, not PlanBuild)
  2. # Source: https://www.huonvalley.tas.gov.au/development/planning/advertised-applications/
  3. require "nokogiri"
  4. require "cgi"
  5. require_relative "../lib/http"
  6. require_relative "../lib/db"
  7. require_relative "../lib/util"
  8. require_relative "../lib/enrich"
  9. TABLE = ENV.fetch("TABLE_NAME") # run_all.sh -> da_huonvalley
  10. START_URL = "https://www.huonvalley.tas.gov.au/development/planning/advertised-applications/"
  11. DB.ensure_table!(TABLE)
  12. # Optional: keep the SharePoint link
  13. begin
  14. DB.client.query("ALTER TABLE `#{DB.client.escape(TABLE)}` ADD COLUMN IF NOT EXISTS document_url TEXT NULL")
  15. rescue StandardError => e
  16. warn "document_url add skipped: #{e.class} #{e.message}"
  17. end
  18. REF_RX = %r{\bDA[-\s]?\d{1,4}/20\d{2}\b}i
  19. def abs_url(base, href)
  20. return "" if href.to_s.strip.empty?
  21. URI.join(base, href).to_s rescue href.to_s
  22. end
  23. def nearest_heading_text(node)
  24. h = node.xpath("preceding::h2[1] | preceding::h3[1]").first
  25. h ? h.text.strip : ""
  26. end
  27. def proposal_between_heading_and(node)
  28. # Walk back to the nearest heading, then take the first non-empty text sibling after it
  29. h = node.xpath("preceding::h2[1] | preceding::h3[1]").first
  30. return "" unless h
  31. sib = h
  32. 12.times do
  33. sib = sib.next_element
  34. break if sib.nil?
  35. t = sib.text.strip.gsub(/\s+/, " ")
  36. next if t.empty? || t.match?(/More Information/i) || t.match?(/Available Documents/i)
  37. return t
  38. end
  39. ""
  40. end
  41. def parse_page(html, base_url)
  42. doc = Nokogiri::HTML(html)
  43. # Each application has a SharePoint doc link labeled “Copy of application for viewing”
  44. anchors = doc.css("a").select { |a|
  45. href = a["href"].to_s
  46. a.text.to_s.strip.match?(/copy of application for viewing/i) || href.match?(/huonvalleycouncil\.sharepoint\.com/i)
  47. }
  48. rows = []
  49. anchors.each do |a|
  50. document_url = abs_url(base_url, a["href"])
  51. heading = nearest_heading_text(a)
  52. ref = heading[/#{REF_RX}/]&.strip || ""
  53. # Get a one-line proposal that appears just after the heading
  54. description = proposal_between_heading_and(a)
  55. description = "Development Application" if description.empty?
  56. # Address sometimes appears in the proposal. If not, keep a readable fallback.
  57. address = if description.match?(/\d+ .*?\b(TAS|Huon|Franklin|Cygnet|Dover|Ranelagh)\b/i)
  58. description
  59. else
  60. heading
  61. end
  62. next if ref.empty? || address.empty?
  63. rows << {
  64. council_reference: ref,
  65. address: address,
  66. description: description,
  67. date_received_raw: "",
  68. date_received: nil,
  69. document_url: document_url
  70. }
  71. end
  72. # Find a Next link for pagination
  73. next_href = nil
  74. if (next_a = doc.css("a").find { |x| x.text.to_s.strip.downcase == "next" })
  75. next_href = abs_url(base_url, next_a["href"])
  76. end
  77. [rows, next_href]
  78. end
  79. saved = 0
  80. url = START_URL
  81. seen_refs = {}
  82. loop do
  83. begin
  84. html = Http.get(url)
  85. rescue StandardError => e
  86. warn "Failed to fetch #{url}: #{e.class} #{e.message}"
  87. break
  88. end
  89. rows, next_url = parse_page(html, url)
  90. rows.each do |r|
  91. # de-dup within a run
  92. next if seen_refs[[r[:council_reference], r[:address]]]
  93. seen_refs[[r[:council_reference], r[:address]]] = true
  94. DB.upsert(TABLE, {
  95. description: r[:description],
  96. date_received: r[:date_received],
  97. date_received_raw: r[:date_received_raw],
  98. address: r[:address],
  99. council_reference: r[:council_reference],
  100. applicant: "",
  101. owner: ""
  102. })
  103. enrich_after_upsert!(
  104. table: TABLE,
  105. council_reference: council_reference,
  106. address: address
  107. )
  108. begin
  109. upd = DB.client.prepare("UPDATE `#{DB.client.escape(TABLE)}` SET document_url = ? WHERE council_reference = ? AND address = ?")
  110. upd.execute(r[:document_url], r[:council_reference], r[:address])
  111. rescue Mysql2::Error => e
  112. warn "[huonvalley] db update skipped for #{r[:council_reference]}: #{e.message}"
  113. end
  114. puts "Upserted #{r[:council_reference]} -> #{r[:address]}"
  115. saved += 1
  116. end
  117. break if next_url.nil? || next_url == url
  118. url = next_url
  119. end
  120. puts "Done #{TABLE}. Saved #{saved} item(s)."