| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169 |
- # West Tamar Council — Advertised Planning Applications
- #
- # Source: https://www.wtc.tas.gov.au/advertised-planning-applications/
- #
- # Page structure — all entries on one page, grouped by h2 headings:
- #
- # <h2>92 Sunset Boulevard, Clarence Point</h2>
- # <p>
- # <strong>APPLICANT:</strong> J & E West<br>
- # <strong>PROPOSAL:</strong> Residential - Dwelling & Outbuilding<br>
- # <strong>LOCATION:</strong> 92 Sunset Boulevard, Clarence Point<br>
- # <strong>CLOSES:</strong> 5pm on 16 April 2026
- # </p>
- # <ul>
- # <li>Application Number: PA NO: 2025065</li>
- # <li>Closes 16 April 2026</li>
- # </ul>
- # <p><a href="https://assets.wtc.tas.gov.au/...PA2025065...pdf">Proposal description</a></p>
- require "nokogiri"
- require "uri"
- require "fileutils"
- require_relative "../lib/scraper_helpers"
- require_relative "../lib/util"
- require_relative "../lib/log"
# Runtime configuration, read once from the environment when the script loads.
# TABLE_NAME is mandatory: ENV.fetch raises KeyError when it is unset.
TABLE = ENV.fetch("TABLE_NAME")
URL = "https://www.wtc.tas.gov.au/advertised-planning-applications/"
# Attachments are only downloaded when the flag is exactly "1".
DOWNLOAD_ATTACHMENTS = "1" == ENV["DOWNLOAD_ATTACHMENTS"]
DOWNLOAD_DIR = ENV.fetch("DOWNLOAD_DIR", "/app/downloads")

# DB is provided by the project helpers required above; presumably this
# creates the destination table when it does not exist yet (confirm in lib/).
DB.ensure_table!(TABLE)
# Make a value usable as a single path component: every run of characters
# other than ASCII word characters, "-" and "." becomes one underscore.
# nil and non-strings are converted with #to_s first.
def safe_name(s)
  unsafe_run = /[^\w\-.]+/
  s.to_s.gsub(unsafe_run, "_")
end
# Download the document at +url+ into DOWNLOAD_DIR/westtamar/<reference>/.
#
# @param url [String, nil] absolute document URL; blank means nothing to fetch
# @param council_reference [String] application reference; sanitised with
#   safe_name and used as the folder name
# @return [String, nil] the "/files/westtamar/<reference>/<file>" path string
#   for the saved file, or nil when the URL is blank, the response body is
#   empty, or any StandardError is raised (the failure is logged, not re-raised)
def download_pdf(url, council_reference)
  return nil if url.to_s.strip.empty?

  ref_dir = safe_name(council_reference)
  fname = safe_name(File.basename(URI.parse(url).path))
  # "." and ".." pass through safe_name untouched but name directories, not
  # files, so treat them like a missing filename.
  fname = "document.pdf" if fname.empty? || fname.match?(/\A\.+\z/)

  body = Http.get(url, headers: { "Accept" => "application/pdf,*/*", "Referer" => URL })
  # Never store (or advertise) a zero-byte document.
  if body.to_s.empty?
    Log.warn "westtamar", "Download failed for #{url}: empty response body"
    return nil
  end

  # Create the folder only once there is something to put in it.
  dir = File.join(DOWNLOAD_DIR, "westtamar", ref_dir)
  FileUtils.mkdir_p(dir)
  File.binwrite(File.join(dir, fname), body)
  puts " saved #{fname} (#{body.bytesize} bytes)"
  "/files/westtamar/#{ref_dir}/#{fname}"
rescue StandardError => e
  # Best effort: a failed attachment must not abort the scrape.
  Log.warn "westtamar", "Download failed for #{url}: #{e.class} #{e.message}"
  nil
end
# Parse "<strong>KEY:</strong> VALUE<br>" pairs from a <p> node.
#
# Every <br>-separated line that looks like "Label: value" becomes one entry.
# Labels are matched case-insensitively and stored upper-cased, so
# "Proposal:" and "PROPOSAL:" both end up under "PROPOSAL". Lines without a
# label are ignored; when a label repeats, the last value wins.
#
# @param p_node [#inner_html, nil] the paragraph node, or nil when none was found
# @return [Hash{String => String}] upper-cased label => trimmed value
def parse_strong_labels(p_node)
  return {} unless p_node

  # Turn <br> into newlines before stripping markup so each pair sits on its own line.
  html = p_node.inner_html.gsub(/<br\s*\/?>/i, "\n")
  Nokogiri::HTML.fragment(html).text.split("\n").each_with_object({}) do |raw, kv|
    # Collapse whitespace runs, non-breaking spaces included, to a single space.
    line = raw.gsub(/[\u00a0\s]+/, " ").strip
    next if line.empty?

    m = line.match(/\A([A-Z][A-Z\s]{1,20}):\s*(.+)\z/i)
    kv[m[1].strip.upcase] = m[2].strip if m
  end
end
# Fetch the listing page and build one item hash per advertised application.
html = Http.get(URL)
doc = Nokogiri::HTML(html)
items = []

# Collapse every run of whitespace (non-breaking spaces included) to one space.
squish = ->(text) { text.to_s.gsub(/[\u00a0\s]+/, " ") }
# True for an <a> whose href mentions ".pdf".
is_pdf = ->(a) { a["href"].to_s.match?(/\.pdf/i) }

# Each application is an <h2> plus its sibling elements up to the next <h2>.
doc.css("h2").each do |h2|
  section = []
  node = h2.next_sibling
  until node.nil? || (node.element? && node.name == "h2")
    section << node if node.element?
    node = node.next_sibling
  end
  next if section.empty?

  # The <p> carrying the APPLICANT/PROPOSAL/LOCATION/CLOSES labels.
  label_p = section.find do |n|
    n.name == "p" && n.text.match?(/APPLICANT|PROPOSAL|LOCATION|CLOSES/i)
  end
  kv = parse_strong_labels(label_p)

  # The <ul> carrying the application number (and a "Closes ..." item).
  ul_node = section.find { |n| n.name == "ul" }
  ul_text = squish.(ul_node&.text)

  # Prefer a .pdf link inside the <ul>; otherwise take the first .pdf link
  # anywhere in the section (e.g. in a trailing <p>).
  pdf_link = ul_node&.css("li a[href]")&.find(&is_pdf)
  pdf_link ||= section.flat_map { |n| n.css("a[href]").to_a }.find(&is_pdf)

  # Reference: "PA NO: 2025065" from the list text, else "PA2025065" in the PDF href.
  ref_no = ul_text[/PA\s*(?:NO:?)?\s*(\d{5,})/i, 1]
  ref_no ||= pdf_link["href"].to_s[/PA(\d{5,})/i, 1] if pdf_link
  next unless ref_no

  # Address: LOCATION label, then ADDRESS label, then the heading text.
  address = kv["LOCATION"] || kv["ADDRESS"] || squish.(h2.text).strip
  next if address.empty?

  description = kv["PROPOSAL"].to_s
  description = "Development Application" if description.empty?

  # Strip a leading time: "5pm on 16 April 2026" -> "16 April 2026".
  closes_raw = kv["CLOSES"].to_s.sub(/\A.*?\bon\s+/i, "").strip
  # No CLOSES label: fall back to the list item "Closes 16 April 2026".
  if closes_raw.empty?
    closes_raw = ul_text[/Closes?\s+(\d{1,2}\s+[A-Za-z]+\s+\d{4})/i, 1].to_s
  end

  items << {
    council_reference: "PA #{ref_no}",
    address: address,
    description: description,
    applicant: kv["APPLICANT"].to_s,
    on_notice_to: Util.parse_aus_date(closes_raw),
    on_notice_to_raw: closes_raw,
    document_url: pdf_link ? abs_url(URL, pdf_link["href"].to_s) : ""
  }
end
puts "Found #{items.length} item(s) for #{TABLE}"

# Store every scraped application; when attachments are enabled, fetch the
# document first so the stored extras can carry the local copy's path.
row_keys = %i[council_reference address description applicant on_notice_to on_notice_to_raw]
items.each do |item|
  remote_url = item[:document_url]
  local_url = nil
  local_url = download_pdf(remote_url, item[:council_reference]) if DOWNLOAD_ATTACHMENTS

  upsert_and_enrich!(
    table: TABLE,
    row: row_keys.to_h { |key| [key, item[key]] }.merge(owner: ""),
    extras: { document_url: remote_url, local_document_url: local_url }
  )
end

puts "Done #{TABLE}. Saved #{items.length} item(s)."
|