huonvalley.rb 3.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133
  1. # Huon Valley Council — Advertised Applications (site page, not PlanBuild)
  2. # Source: https://www.huonvalley.tas.gov.au/development/planning/advertised-applications/
require "cgi"
require "uri"
require "nokogiri"
require_relative "../lib/enrich"
require_relative "../lib/log"
  7. TABLE = ENV.fetch("TABLE_NAME") # run_all.sh -> da_huonvalley
  8. START_URL = "https://www.huonvalley.tas.gov.au/development/planning/advertised-applications/"
  9. DB.ensure_table!(TABLE)
  10. REF_RX = %r{\bDA[-\s]?\d{1,4}/20\d{2}\b}i
  11. def abs_url(base, href)
  12. return "" if href.to_s.strip.empty?
  13. URI.join(base, href).to_s rescue href.to_s
  14. end
  15. def nearest_heading_text(node)
  16. h = node.xpath("preceding::h2[1] | preceding::h3[1]").first
  17. h ? h.text.strip : ""
  18. end
  19. def proposal_between_heading_and(node)
  20. # Walk back to the nearest heading, then take the first non-empty text sibling after it
  21. h = node.xpath("preceding::h2[1] | preceding::h3[1]").first
  22. return "" unless h
  23. sib = h
  24. 12.times do
  25. sib = sib.next_element
  26. break if sib.nil?
  27. t = sib.text.strip.gsub(/\s+/, " ")
  28. next if t.empty? || t.match?(/More Information/i) || t.match?(/Available Documents/i)
  29. return t
  30. end
  31. ""
  32. end
  33. def parse_page(html, base_url)
  34. doc = Nokogiri::HTML(html)
  35. # Each application has a SharePoint doc link labeled “Copy of application for viewing”
  36. anchors = doc.css("a").select { |a|
  37. href = a["href"].to_s
  38. a.text.to_s.strip.match?(/copy of application for viewing/i) || href.match?(/huonvalleycouncil\.sharepoint\.com/i)
  39. }
  40. rows = []
  41. anchors.each do |a|
  42. document_url = abs_url(base_url, a["href"])
  43. heading = nearest_heading_text(a)
  44. ref = heading[/#{REF_RX}/]&.strip || ""
  45. # Get a one-line proposal that appears just after the heading
  46. description = proposal_between_heading_and(a)
  47. description = "Development Application" if description.empty?
  48. # Address sometimes appears in the proposal. If not, keep a readable fallback.
  49. address = if description.match?(/\d+ .*?\b(TAS|Huon|Franklin|Cygnet|Dover|Ranelagh)\b/i)
  50. description
  51. else
  52. heading
  53. end
  54. next if ref.empty? || address.empty?
  55. rows << {
  56. council_reference: ref,
  57. address: address,
  58. description: description,
  59. date_received_raw: "",
  60. date_received: nil,
  61. document_url: document_url
  62. }
  63. end
  64. # Find a Next link for pagination
  65. next_href = nil
  66. if (next_a = doc.css("a").find { |x| x.text.to_s.strip.downcase == "next" })
  67. next_href = abs_url(base_url, next_a["href"])
  68. end
  69. [rows, next_href]
  70. end
  71. saved = 0
  72. url = START_URL
  73. seen_refs = {}
  74. loop do
  75. begin
  76. html = Http.get(url)
  77. rescue StandardError => e
  78. Log.warn "scraper", "Failed to fetch #{url}: #{e.class} #{e.message}"
  79. break
  80. end
  81. rows, next_url = parse_page(html, url)
  82. rows.each do |r|
  83. # de-dup within a run
  84. next if seen_refs[[r[:council_reference], r[:address]]]
  85. seen_refs[[r[:council_reference], r[:address]]] = true
  86. DB.upsert(TABLE, {
  87. description: r[:description],
  88. date_received: r[:date_received],
  89. date_received_raw: r[:date_received_raw],
  90. address: r[:address],
  91. council_reference: r[:council_reference],
  92. document_url: r[:document_url],
  93. applicant: "",
  94. owner: ""
  95. })
  96. enrich_after_upsert!(
  97. table: TABLE,
  98. council_reference: r[:council_reference],
  99. address: r[:address]
  100. )
  101. puts "Upserted #{r[:council_reference]} -> #{r[:address]}"
  102. saved += 1
  103. end
  104. break if next_url.nil? || next_url == url
  105. url = next_url
  106. end
  107. puts "Done #{TABLE}. Saved #{saved} item(s)."