westtamar.rb 5.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169
  1. # West Tamar Council — Advertised Planning Applications
  2. #
  3. # Source: https://www.wtc.tas.gov.au/advertised-planning-applications/
  4. #
  5. # Page structure — all entries on one page, grouped by h2 headings:
  6. #
  7. # <h2>92 Sunset Boulevard, Clarence Point</h2>
  8. # <p>
  9. # <strong>APPLICANT:</strong> J & E West<br>
  10. # <strong>PROPOSAL:</strong> Residential - Dwelling & Outbuilding<br>
  11. # <strong>LOCATION:</strong> 92 Sunset Boulevard, Clarence Point<br>
  12. # <strong>CLOSES:</strong> 5pm on 16 April 2026
  13. # </p>
  14. # <ul>
  15. # <li>Application Number: PA NO: 2025065</li>
  16. # <li>Closes 16 April 2026</li>
  17. # </ul>
  18. # <p><a href="https://assets.wtc.tas.gov.au/...PA2025065...pdf">Proposal description</a></p>
  19. require "nokogiri"
  20. require "uri"
  21. require "fileutils"
  22. require_relative "../lib/scraper_helpers"
  23. require_relative "../lib/util"
  24. require_relative "../lib/log"
  # --- Runtime configuration ---
  # Destination table name; ENV.fetch raises KeyError when TABLE_NAME is unset.
  TABLE = ENV.fetch("TABLE_NAME")
  # Listing page that gets scraped.
  URL = "https://www.wtc.tas.gov.au/advertised-planning-applications/"
  # Attachments are downloaded only when DOWNLOAD_ATTACHMENTS is exactly "1".
  DOWNLOAD_ATTACHMENTS = ENV["DOWNLOAD_ATTACHMENTS"] == "1"
  # Root directory for downloaded attachments (default: /app/downloads).
  DOWNLOAD_DIR = ENV.fetch("DOWNLOAD_DIR", "/app/downloads")
  # Runs at load time, before any scraping starts.
  # NOTE(review): DB is presumably provided by one of the required lib helpers — confirm.
  DB.ensure_table!(TABLE)
  # Reduce +s+ to a token usable in file names and URL paths: the value is
  # stringified, then every run of characters other than word characters,
  # "-" and "." is replaced by a single "_".
  def safe_name(s)
    s.to_s.gsub(/[^\w\-.]+/, "_")
  end
  # Download the document at +url+ into DOWNLOAD_DIR/westtamar/<reference>/.
  #
  # Returns the "/files/westtamar/<reference>/<file>" path for the saved file,
  # or nil when +url+ is blank or when any step raises a StandardError (the
  # failure is logged as a warning instead of being re-raised).
  def download_pdf(url, council_reference)
    return nil if url.to_s.strip.empty?

    ref_dir = safe_name(council_reference)
    target_dir = File.join(DOWNLOAD_DIR, "westtamar", ref_dir)
    FileUtils.mkdir_p(target_dir)

    # Name the file after the last path segment of the URL; fall back to a
    # fixed name when the URL has no usable segment.
    file_name = safe_name(File.basename(URI.parse(url).path))
    file_name = "document.pdf" if file_name.empty?

    request_headers = { "Accept" => "application/pdf,*/*", "Referer" => URL }
    payload = Http.get(url, headers: request_headers)
    File.binwrite(File.join(target_dir, file_name), payload)
    puts " saved #{file_name} (#{payload.bytesize} bytes)"

    "/files/westtamar/#{ref_dir}/#{file_name}"
  rescue StandardError => e
    Log.warn "westtamar", "Download failed for #{url}: #{e.class} #{e.message}"
    nil
  end
  # Parse "<strong>KEY:</strong> VALUE<br>" pairs from a <p> node.
  #
  # p_node - a node responding to #inner_html, or nil.
  #
  # Returns a Hash mapping the upper-cased label (e.g. "APPLICANT") to its
  # stripped value; an empty Hash when +p_node+ is nil. Lines that do not look
  # like "LABEL: value" are ignored. When a label repeats, the last one wins.
  #
  # Labels are matched case-insensitively ("Proposal:" and "PROPOSAL:" both
  # yield "PROPOSAL"), and any run of whitespace / non-breaking spaces inside a
  # line is collapsed to a single space.
  def parse_strong_labels(p_node)
    return {} unless p_node

    # Turn <br> into newlines so each "KEY: VALUE" pair lands on its own line
    # once the remaining markup has been stripped.
    markup = p_node.inner_html.gsub(/<br\s*\/?>/i, "\n")

    Nokogiri::HTML.fragment(markup).text.each_line.with_object({}) do |raw, kv|
      line = raw.gsub(/[\u00a0\s]+/, " ").strip
      next if line.empty?

      # Label: a letter followed by 1-20 letters/spaces, then a colon.
      m = line.match(/\A([A-Za-z][A-Za-z\s]{1,20}):\s*(.+)\z/)
      kv[m[1].strip.upcase] = m[2].strip if m
    end
  end
  html = Http.get(URL)
  doc = Nokogiri::HTML(html)

  # Each application is an <h2> heading followed by its detail nodes. For every
  # heading, gather the element siblings up to the next <h2> and turn that
  # section into one record. Sections with no following elements, no
  # recognisable reference, or an empty address are skipped.
  items = doc.css("h2").filter_map do |h2|
    section = []
    node = h2
    while (node = node.next_sibling)
      next unless node.element?
      break if node.name == "h2"

      section << node
    end
    next if section.empty?

    # The <p> carrying the APPLICANT/PROPOSAL/LOCATION/CLOSES labels.
    details_p = section.find { |n| n.name == "p" && n.text =~ /APPLICANT|PROPOSAL|LOCATION|CLOSES/i }
    kv = parse_strong_labels(details_p)

    # The first <ul> of the section holds the application number.
    list = section.find { |n| n.name == "ul" }
    list_text = list&.text.to_s.gsub(/\u00a0|\s+/, " ")

    # Prefer a PDF link inside a list item; otherwise take the first PDF link
    # found anywhere in the section (e.g. in a trailing <p>).
    pdf_href = ->(a) { a["href"].to_s =~ /\.pdf/i }
    pdf_link = list&.css("li a[href]")&.find { |a| pdf_href.call(a) }
    pdf_link ||= section.flat_map { |n| n.css("a[href]").to_a }.find { |a| pdf_href.call(a) }

    # Reference digits: "PA NO: 2025065" in the list text, else the
    # "PA<digits>" embedded in the PDF link.
    ref_digits = list_text[/PA\s*(?:NO:?)?\s*(\d{5,})/i, 1]
    ref_digits ||= pdf_link["href"].to_s[/PA(\d{5,})/i, 1] if pdf_link
    next unless ref_digits

    ref = "PA #{ref_digits}"

    # Address from the LOCATION (or ADDRESS) label, falling back to the heading.
    address = kv["LOCATION"] || kv["ADDRESS"] || h2.text.gsub(/\u00a0|\s+/, " ").strip
    next if address.empty?

    description = kv["PROPOSAL"].to_s
    description = "Development Application" if description.empty?

    # Strip the time prefix: "5pm on 16 April 2026" -> "16 April 2026". When the
    # label yields nothing, fall back to the "Closes 16 April 2026" list item.
    closes_raw = kv["CLOSES"].to_s.sub(/\A.*?\bon\s+/i, "").strip
    if closes_raw.empty?
      listed_date = list_text[/Closes?\s+(\d{1,2}\s+[A-Za-z]+\s+\d{4})/i, 1]
      closes_raw = listed_date if listed_date
    end

    {
      council_reference: ref,
      address: address,
      description: description,
      applicant: kv["APPLICANT"].to_s,
      on_notice_to: Util.parse_aus_date(closes_raw),
      on_notice_to_raw: closes_raw,
      document_url: pdf_link ? abs_url(URL, pdf_link["href"].to_s) : ""
    }
  end
  puts "Found #{items.length} item(s) for #{TABLE}"

  # Persist every scraped record. When attachment downloads are enabled the
  # PDF is fetched first, so its local path (nil when disabled or when the
  # download fails) is stored next to the remote URL.
  items.each do |item|
    local_url = nil
    local_url = download_pdf(item[:document_url], item[:council_reference]) if DOWNLOAD_ATTACHMENTS

    core_fields = item.slice(
      :council_reference, :address, :description, :applicant, :on_notice_to, :on_notice_to_raw
    )

    upsert_and_enrich!(
      table: TABLE,
      row: core_fields.merge(owner: ""),
      extras: { document_url: item[:document_url], local_document_url: local_url }
    )
  end

  puts "Done #{TABLE}. Saved #{items.length} item(s)."