# southernmidlands.rb
#
# Southern Midlands Council — Advertised Development Applications
# Detail pages list each application as a paragraph of the form:
#   "Location: <addr>\nProposal: DA<ref> - <desc>"
# A single item page may contain multiple DA entries.
  4. require "nokogiri"
  5. require "uri"
  6. require "cgi"
  7. require_relative "../lib/http"
  8. require_relative "../lib/db"
  9. require_relative "../lib/util"
  10. require_relative "../lib/enrich"
  11. require_relative "../lib/log"
  12. TABLE = ENV.fetch("TABLE_NAME") # da_southernmidlands
  13. LIST_URL = "https://www.southernmidlands.tas.gov.au/advertised-development-applications/"
  14. DB.ensure_table!(TABLE)
  15. def abs_url(base, href)
  16. return "" if href.to_s.strip.empty?
  17. URI.join(base, href).to_s
  18. rescue URI::InvalidURIError
  19. href.to_s
  20. end
  21. # ---- fetch list page and collect item links ----
  22. list_html = Http.get(LIST_URL)
  23. list_doc = Nokogiri::HTML(list_html)
  24. detail_links = list_doc.css("article a[href*='?item='], article h2 a, article h3 a").map { |a|
  25. href = a["href"].to_s.strip
  26. next if href.empty? || href.start_with?("#")
  27. abs_url(LIST_URL, href)
  28. }.compact.uniq
  29. puts "Found #{detail_links.size} candidate link(s) for #{TABLE}"
  30. saved = 0
  31. detail_links.each do |url|
  32. html = begin
  33. Http.get(url)
  34. rescue StandardError => e
  35. Log.warn "southernmidlands", "Skip #{url}: #{e.class} #{e.message}"
  36. next
  37. end
  38. doc = Nokogiri::HTML(html)
  39. # Each DA entry is a <p> block containing "Location:" text.
  40. # One page may have multiple such paragraphs.
  41. doc.css("p").each do |para|
  42. # Preserve line breaks from <br> tags before stripping HTML
  43. inner = para.inner_html.gsub(/<br\s*\/?>/, "\n")
  44. text = Nokogiri::HTML.fragment(inner).text.gsub(/\r/, "").strip
  45. next unless text.match?(/Location:/i)
  46. lines = text.split("\n").map(&:strip).reject(&:empty?)
  47. loc_line = lines.find { |l| l.match?(/\ALocation:/i) }
  48. prop_line = lines.find { |l| l.match?(/\AProposal:/i) }
  49. address = loc_line&.sub(/\ALocation:\s*/i, "")&.strip.to_s
  50. proposal = prop_line&.sub(/\AProposal:\s*/i, "")&.strip.to_s
  51. next if address.empty? || proposal.empty?
  52. # Extract DA reference from proposal line (e.g. "DA2600035 - Dwelling")
  53. ref_match = proposal.match(/\b(DA\s*[\d\/]+)\b/i)
  54. council_reference = ref_match ? ref_match[1].gsub(/\s+/, "") : nil
  55. description = proposal.sub(/\A(DA\s*[\d\/]+)\s*[-:]\s*/i, "").strip
  56. if council_reference.nil? || council_reference.empty?
  57. Log.warn "southernmidlands", "No DA ref on #{url} — skipping paragraph"
  58. next
  59. end
  60. # PDF link — check this paragraph then its next sibling
  61. pdf_href = para.at_css("a[href$='.pdf'], a[href*='.pdf?']")&.[]("href")
  62. unless pdf_href
  63. sib = para.next_element
  64. pdf_href = sib&.at_css("a[href$='.pdf'], a[href*='.pdf?']")&.[]("href")
  65. end
  66. document_url = pdf_href ? abs_url(url, pdf_href) : nil
  67. begin
  68. DB.upsert(TABLE, {
  69. description: description,
  70. address: address[0, 255],
  71. council_reference: council_reference[0, 100],
  72. document_url: document_url
  73. })
  74. enrich_after_upsert!(
  75. table: TABLE,
  76. council_reference: council_reference,
  77. address: address
  78. )
  79. Log.info "southernmidlands", "Upserted #{council_reference} -> #{address}"
  80. saved += 1
  81. rescue StandardError => e
  82. Log.warn "southernmidlands", "DB error for #{council_reference}: #{e.class} #{e.message}"
  83. end
  84. end
  85. end
  86. puts "Done #{TABLE}. Saved #{saved} item(s)."